feat: thinking-only prefill continuation for structured reasoning responses (#5931)

When the model produces structured reasoning (via API fields such as .reasoning, .reasoning_content, or .reasoning_details) but no visible text content, append the assistant message as prefill and continue the loop. The model sees its own reasoning context on the next turn and produces the text portion. Inspired by clawdbot's 'incomplete-text' recovery pattern. Up to 2 prefill attempts are made before falling through to the existing '(empty)' terminal.

Key design decisions:
- Only triggers for structured reasoning (API fields), NOT inline <think> tags
- Prefill messages are popped on success to maintain strict role alternation
- The _thinking_prefill marker is stripped from all API message building paths
- Works across all providers: OpenAI (continuation), Anthropic (native prefill)

Verified with E2E tests: a simulated thinking-only response followed by a real OpenRouter continuation produces correct content. Also confirmed that Qwen models consistently produce structured-reasoning-only responses under token pressure.
2026-04-07 13:19:06 -07:00
parent 6e2f6a25a1
commit ab8f9c089e
2 changed files with 94 additions and 14 deletions
--- a/run_agent.py
+++ b/run_agent.py
@@ -5741,6 +5741,7 @@ class AIAgent:
                api_msg.pop("reasoning", None)
                api_msg.pop("finish_reason", None)
                api_msg.pop("_flush_sentinel", None)
+                api_msg.pop("_thinking_prefill", None)
                if _needs_sanitize:
                    self._sanitize_tool_calls_for_strict_api(api_msg)
                api_messages.append(api_msg)
@@ -6664,7 +6665,7 @@ class AIAgent:
            api_messages = []
            for msg in messages:
                api_msg = msg.copy()
-                for internal_field in ("reasoning", "finish_reason"):
+                for internal_field in ("reasoning", "finish_reason", "_thinking_prefill"):
                    api_msg.pop(internal_field, None)
                if _needs_sanitize:
                    self._sanitize_tool_calls_for_strict_api(api_msg)
@@ -6856,6 +6857,7 @@ class AIAgent:
        self._empty_content_retries = 0
        self._incomplete_scratchpad_retries = 0
        self._codex_incomplete_retries = 0
+        self._thinking_prefill_retries = 0
        self._last_content_with_tools = None
        self._mute_post_response = False
        self._surrogate_sanitized = False
@@ -7201,6 +7203,8 @@ class AIAgent:
                # Remove finish_reason - not accepted by strict APIs (e.g. Mistral)
                if "finish_reason" in api_msg:
                    api_msg.pop("finish_reason")
+                # Strip internal thinking-prefill marker
+                api_msg.pop("_thinking_prefill", None)
                # Strip Codex Responses API fields (call_id, response_item_id) for
                # strict providers like Mistral, Fireworks, etc. that reject unknown fields.
                # Uses new dicts so the internal messages list retains the fields
@@ -8735,6 +8739,15 @@ class AIAgent:
                            if clean:
                                self._vprint(f"  ┊ 💬 {clean}")
                    
+                    # Pop thinking-only prefill message(s) before appending
+                    # (tool-call path — same rationale as the final-response path).
+                    while (
+                        messages
+                        and isinstance(messages[-1], dict)
+                        and messages[-1].get("_thinking_prefill")
+                    ):
+                        messages.pop()
+
                    messages.append(assistant_msg)

                    # Close any open streaming display (response box, reasoning
@@ -8848,11 +8861,36 @@ class AIAgent:
                            self._response_was_previewed = True
                            break

-                        # Reasoning-only response: the model produced thinking
-                        # but no visible content.  This is a valid response —
-                        # keep reasoning in its own field and set content to
-                        # "(empty)" so every provider accepts the message.
-                        # No retries needed.
+                        # ── Thinking-only prefill continuation ──────────
+                        # The model produced structured reasoning (via API
+                        # fields) but no visible text content.  Rather than
+                        # giving up, append the assistant message as-is and
+                        # continue — the model will see its own reasoning
+                        # on the next turn and produce the text portion.
+                        # Inspired by clawdbot's "incomplete-text" recovery.
+                        _has_structured = bool(
+                            getattr(assistant_message, "reasoning", None)
+                            or getattr(assistant_message, "reasoning_content", None)
+                            or getattr(assistant_message, "reasoning_details", None)
+                        )
+                        if _has_structured and self._thinking_prefill_retries < 2:
+                            self._thinking_prefill_retries += 1
+                            self._vprint(
+                                f"{self.log_prefix}↻ Thinking-only response — "
+                                f"prefilling to continue "
+                                f"({self._thinking_prefill_retries}/2)"
+                            )
+                            interim_msg = self._build_assistant_message(
+                                assistant_message, "incomplete"
+                            )
+                            interim_msg["_thinking_prefill"] = True
+                            messages.append(interim_msg)
+                            self._session_messages = messages
+                            self._save_session_log(messages)
+                            continue
+
+                        # Exhausted prefill attempts or no structured
+                        # reasoning — fall through to "(empty)" terminal.
                        reasoning_text = self._extract_reasoning(assistant_message)
                        assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
                        assistant_msg["content"] = "(empty)"
@@ -8871,6 +8909,7 @@ class AIAgent:
                    if hasattr(self, '_empty_content_retries'):
                        self._empty_content_retries = 0
                    self._last_empty_content_signature = None
+                    self._thinking_prefill_retries = 0

                    if (
                        self.api_mode == "codex_responses"
@@ -8909,7 +8948,18 @@ class AIAgent:
                    final_response = self._strip_think_blocks(final_response).strip()
                    
                    final_msg = self._build_assistant_message(assistant_message, finish_reason)
-                    
+
+                    # Pop thinking-only prefill message(s) before appending
+                    # the final response.  This avoids consecutive assistant
+                    # messages which break strict-alternation providers
+                    # (Anthropic Messages API) and keeps history clean.
+                    while (
+                        messages
+                        and isinstance(messages[-1], dict)
+                        and messages[-1].get("_thinking_prefill")
+                    ):
+                        messages.pop()
+
                    messages.append(final_msg)
                    
                    if not self.quiet_mode:
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -1547,7 +1547,7 @@ class TestRunConversation:
        assert any(m.get("reasoning") for m in assistant_msgs)

    def test_reasoning_only_local_resumed_no_compression_triggered(self, agent):
-        """Reasoning-only responses no longer trigger compression — accepted immediately."""
+        """Reasoning-only responses no longer trigger compression — prefill then accepted."""
        self._setup_agent(agent)
        agent.base_url = "http://127.0.0.1:1234/v1"
        agent.compression_enabled = True
@@ -1561,8 +1561,9 @@ class TestRunConversation:
            {"role": "assistant", "content": "old answer"},
        ]

+        # 3 responses: original + 2 prefill continuations (structured reasoning triggers prefill)
        with (
-            patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp]),
+            patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp, empty_resp, empty_resp]),
            patch.object(agent, "_compress_context") as mock_compress,
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
@@ -1573,17 +1574,18 @@ class TestRunConversation:
        mock_compress.assert_not_called()  # no compression triggered
        assert result["completed"] is True
        assert result["final_response"] == "(empty)"
-        assert result["api_calls"] == 1
+        assert result["api_calls"] == 3  # 1 original + 2 prefill continuations

-    def test_reasoning_only_response_accepted_without_retry(self, agent):
-        """Reasoning-only response should be accepted with (empty) content, no retries."""
+    def test_reasoning_only_response_prefill_then_empty(self, agent):
+        """Structured reasoning-only triggers prefill continuation (up to 2), then falls through to (empty)."""
        self._setup_agent(agent)
        empty_resp = _mock_response(
            content=None,
            finish_reason="stop",
            reasoning_content="structured reasoning answer",
        )
-        agent.client.chat.completions.create.side_effect = [empty_resp]
+        # 3 responses: original + 2 prefill continuations, all reasoning-only
+        agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp, empty_resp]
        with (
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
@@ -1592,7 +1594,35 @@ class TestRunConversation:
            result = agent.run_conversation("answer me")
        assert result["completed"] is True
        assert result["final_response"] == "(empty)"
-        assert result["api_calls"] == 1  # no retries
+        assert result["api_calls"] == 3  # 1 original + 2 prefill continuations
+
+    def test_reasoning_only_prefill_succeeds_on_continuation(self, agent):
+        """When prefill continuation produces content, it becomes the final response."""
+        self._setup_agent(agent)
+        empty_resp = _mock_response(
+            content=None,
+            finish_reason="stop",
+            reasoning_content="structured reasoning answer",
+        )
+        content_resp = _mock_response(
+            content="Here is the actual answer.",
+            finish_reason="stop",
+        )
+        agent.client.chat.completions.create.side_effect = [empty_resp, content_resp]
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("answer me")
+        assert result["completed"] is True
+        assert result["final_response"] == "Here is the actual answer."
+        assert result["api_calls"] == 2  # 1 original + 1 prefill continuation
+        # Prefill message should be cleaned up — no consecutive assistant messages
+        roles = [m.get("role") for m in result["messages"]]
+        for i in range(len(roles) - 1):
+            if roles[i] == "assistant" and roles[i + 1] == "assistant":
+                raise AssertionError("Consecutive assistant messages found in history")

    def test_truly_empty_response_accepted_without_retry(self, agent):
        """Truly empty response (no content, no reasoning) should still complete with (empty)."""