feat: thinking-only prefill continuation for structured reasoning responses (#5931)

When the model produces structured reasoning (via API fields like .reasoning,
.reasoning_content, .reasoning_details) but no visible text content, append
the assistant message as prefill and continue the loop. The model sees its own
reasoning context on the next turn and produces the text portion.

Inspired by clawdbot's 'incomplete-text' recovery pattern. Up to 2 prefill
attempts before falling through to the existing '(empty)' terminal.

Key design decisions:
- Only triggers for structured reasoning (API fields), NOT inline <think> tags
- Prefill messages are popped on success to maintain strict role alternation
- _thinking_prefill marker stripped from all API message building paths
- Works across all providers: OpenAI (continuation), Anthropic (native prefill)

Verified with E2E tests: simulated thinking-only → real OpenRouter continuation
produces correct content. Also confirmed Qwen models consistently produce
structured-reasoning-only responses under token pressure.
This commit is contained in:
Teknium
2026-04-07 13:19:06 -07:00
committed by GitHub
parent 6e2f6a25a1
commit ab8f9c089e
2 changed files with 94 additions and 14 deletions

View File

@@ -5741,6 +5741,7 @@ class AIAgent:
api_msg.pop("reasoning", None)
api_msg.pop("finish_reason", None)
api_msg.pop("_flush_sentinel", None)
api_msg.pop("_thinking_prefill", None)
if _needs_sanitize:
self._sanitize_tool_calls_for_strict_api(api_msg)
api_messages.append(api_msg)
@@ -6664,7 +6665,7 @@ class AIAgent:
api_messages = []
for msg in messages:
api_msg = msg.copy()
for internal_field in ("reasoning", "finish_reason"):
for internal_field in ("reasoning", "finish_reason", "_thinking_prefill"):
api_msg.pop(internal_field, None)
if _needs_sanitize:
self._sanitize_tool_calls_for_strict_api(api_msg)
@@ -6856,6 +6857,7 @@ class AIAgent:
self._empty_content_retries = 0
self._incomplete_scratchpad_retries = 0
self._codex_incomplete_retries = 0
self._thinking_prefill_retries = 0
self._last_content_with_tools = None
self._mute_post_response = False
self._surrogate_sanitized = False
@@ -7201,6 +7203,8 @@ class AIAgent:
# Remove finish_reason - not accepted by strict APIs (e.g. Mistral)
if "finish_reason" in api_msg:
api_msg.pop("finish_reason")
# Strip internal thinking-prefill marker
api_msg.pop("_thinking_prefill", None)
# Strip Codex Responses API fields (call_id, response_item_id) for
# strict providers like Mistral, Fireworks, etc. that reject unknown fields.
# Uses new dicts so the internal messages list retains the fields
@@ -8735,6 +8739,15 @@ class AIAgent:
if clean:
self._vprint(f" ┊ 💬 {clean}")
# Pop thinking-only prefill message(s) before appending
# (tool-call path — same rationale as the final-response path).
while (
messages
and isinstance(messages[-1], dict)
and messages[-1].get("_thinking_prefill")
):
messages.pop()
messages.append(assistant_msg)
# Close any open streaming display (response box, reasoning
@@ -8848,11 +8861,36 @@ class AIAgent:
self._response_was_previewed = True
break
# Reasoning-only response: the model produced thinking
# but no visible content. This is a valid response —
# keep reasoning in its own field and set content to
# "(empty)" so every provider accepts the message.
# No retries needed.
# ── Thinking-only prefill continuation ──────────
# The model produced structured reasoning (via API
# fields) but no visible text content. Rather than
# giving up, append the assistant message as-is and
# continue — the model will see its own reasoning
# on the next turn and produce the text portion.
# Inspired by clawdbot's "incomplete-text" recovery.
_has_structured = bool(
getattr(assistant_message, "reasoning", None)
or getattr(assistant_message, "reasoning_content", None)
or getattr(assistant_message, "reasoning_details", None)
)
if _has_structured and self._thinking_prefill_retries < 2:
self._thinking_prefill_retries += 1
self._vprint(
f"{self.log_prefix}↻ Thinking-only response — "
f"prefilling to continue "
f"({self._thinking_prefill_retries}/2)"
)
interim_msg = self._build_assistant_message(
assistant_message, "incomplete"
)
interim_msg["_thinking_prefill"] = True
messages.append(interim_msg)
self._session_messages = messages
self._save_session_log(messages)
continue
# Exhausted prefill attempts or no structured
# reasoning — fall through to "(empty)" terminal.
reasoning_text = self._extract_reasoning(assistant_message)
assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
assistant_msg["content"] = "(empty)"
@@ -8871,6 +8909,7 @@ class AIAgent:
if hasattr(self, '_empty_content_retries'):
self._empty_content_retries = 0
self._last_empty_content_signature = None
self._thinking_prefill_retries = 0
if (
self.api_mode == "codex_responses"
@@ -8909,7 +8948,18 @@ class AIAgent:
final_response = self._strip_think_blocks(final_response).strip()
final_msg = self._build_assistant_message(assistant_message, finish_reason)
# Pop thinking-only prefill message(s) before appending
# the final response. This avoids consecutive assistant
# messages which break strict-alternation providers
# (Anthropic Messages API) and keeps history clean.
while (
messages
and isinstance(messages[-1], dict)
and messages[-1].get("_thinking_prefill")
):
messages.pop()
messages.append(final_msg)
if not self.quiet_mode:

View File

@@ -1547,7 +1547,7 @@ class TestRunConversation:
assert any(m.get("reasoning") for m in assistant_msgs)
def test_reasoning_only_local_resumed_no_compression_triggered(self, agent):
"""Reasoning-only responses no longer trigger compression — accepted immediately."""
"""Reasoning-only responses no longer trigger compression — prefill then accepted."""
self._setup_agent(agent)
agent.base_url = "http://127.0.0.1:1234/v1"
agent.compression_enabled = True
@@ -1561,8 +1561,9 @@ class TestRunConversation:
{"role": "assistant", "content": "old answer"},
]
# 3 responses: original + 2 prefill continuations (structured reasoning triggers prefill)
with (
patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp]),
patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp, empty_resp, empty_resp]),
patch.object(agent, "_compress_context") as mock_compress,
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
@@ -1573,17 +1574,18 @@ class TestRunConversation:
mock_compress.assert_not_called() # no compression triggered
assert result["completed"] is True
assert result["final_response"] == "(empty)"
assert result["api_calls"] == 1
assert result["api_calls"] == 3 # 1 original + 2 prefill continuations
def test_reasoning_only_response_accepted_without_retry(self, agent):
"""Reasoning-only response should be accepted with (empty) content, no retries."""
def test_reasoning_only_response_prefill_then_empty(self, agent):
"""Structured reasoning-only triggers prefill continuation (up to 2), then falls through to (empty)."""
self._setup_agent(agent)
empty_resp = _mock_response(
content=None,
finish_reason="stop",
reasoning_content="structured reasoning answer",
)
agent.client.chat.completions.create.side_effect = [empty_resp]
# 3 responses: original + 2 prefill continuations, all reasoning-only
agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp, empty_resp]
with (
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
@@ -1592,7 +1594,35 @@ class TestRunConversation:
result = agent.run_conversation("answer me")
assert result["completed"] is True
assert result["final_response"] == "(empty)"
assert result["api_calls"] == 1 # no retries
assert result["api_calls"] == 3 # 1 original + 2 prefill continuations
def test_reasoning_only_prefill_succeeds_on_continuation(self, agent):
"""When prefill continuation produces content, it becomes the final response."""
self._setup_agent(agent)
empty_resp = _mock_response(
content=None,
finish_reason="stop",
reasoning_content="structured reasoning answer",
)
content_resp = _mock_response(
content="Here is the actual answer.",
finish_reason="stop",
)
agent.client.chat.completions.create.side_effect = [empty_resp, content_resp]
with (
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
patch.object(agent, "_cleanup_task_resources"),
):
result = agent.run_conversation("answer me")
assert result["completed"] is True
assert result["final_response"] == "Here is the actual answer."
assert result["api_calls"] == 2 # 1 original + 1 prefill continuation
# Prefill message should be cleaned up — no consecutive assistant messages
roles = [m.get("role") for m in result["messages"]]
for i in range(len(roles) - 1):
if roles[i] == "assistant" and roles[i + 1] == "assistant":
raise AssertionError("Consecutive assistant messages found in history")
def test_truly_empty_response_accepted_without_retry(self, agent):
"""Truly empty response (no content, no reasoning) should still complete with (empty)."""