diff --git a/run_agent.py b/run_agent.py
index 6a0eb7442..dd3d1156a 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -6340,6 +6340,62 @@ class AIAgent:
if finish_reason == "length":
self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True)
+ # ── Detect thinking-budget exhaustion ──────────────
+ # When the model spends ALL output tokens on reasoning
+ # and has none left for the response, continuation
+ # retries are pointless. Detect this early and give a
+ # targeted error instead of wasting 3 API calls.
+ _trunc_content = None
+ if self.api_mode == "chat_completions":
+ _trunc_msg = response.choices[0].message if (hasattr(response, "choices") and response.choices) else None
+ _trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None
+ elif self.api_mode == "anthropic_messages":
+ # Anthropic response.content is a list of blocks
+ _text_parts = []
+ for _blk in getattr(response, "content", []):
+ if getattr(_blk, "type", None) == "text":
+ _text_parts.append(getattr(_blk, "text", ""))
+ _trunc_content = "\n".join(_text_parts) if _text_parts else None
+
+                        # Exhausted when there is no visible content at all, or
+                        # the content contains nothing after the <think> block.
+                        _thinking_exhausted = _trunc_content is None or (
+                            not self._has_content_after_think_block(_trunc_content)
+                        )
+
+ if _thinking_exhausted:
+ _exhaust_error = (
+ "Model used all output tokens on reasoning with none left "
+ "for the response. Try lowering reasoning effort or "
+ "increasing max_tokens."
+ )
+ self._vprint(
+ f"{self.log_prefix}💭 Reasoning exhausted the output token budget — "
+ f"no visible response was produced.",
+ force=True,
+ )
+ # Return a user-friendly message as the response so
+ # CLI (response box) and gateway (chat message) both
+ # display it naturally instead of a suppressed error.
+ _exhaust_response = (
+ "⚠️ **Thinking Budget Exhausted**\n\n"
+ "The model used all its output tokens on reasoning "
+ "and had none left for the actual response.\n\n"
+ "To fix this:\n"
+ "→ Lower reasoning effort: `/thinkon low` or `/thinkon minimal`\n"
+ "→ Increase the output token limit: "
+ "set `model.max_tokens` in config.yaml"
+ )
+ self._cleanup_task_resources(effective_task_id)
+ self._persist_session(messages, conversation_history)
+ return {
+ "final_response": _exhaust_response,
+ "messages": messages,
+ "api_calls": api_call_count,
+ "completed": False,
+ "partial": True,
+ "error": _exhaust_error,
+ }
+
if self.api_mode == "chat_completions":
assistant_message = response.choices[0].message
if not assistant_message.tool_calls:
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
index b6aaedf72..4a3537e9b 100644
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -1372,19 +1372,11 @@ class TestRunConversation:
assert result["final_response"] == "Recovered after compression"
assert result["completed"] is True
- @pytest.mark.parametrize(
- ("first_content", "second_content", "expected_final"),
- [
- ("Part 1 ", "Part 2", "Part 1 Part 2"),
- ("internal reasoning", "Recovered final answer", "Recovered final answer"),
- ],
- )
- def test_length_finish_reason_requests_continuation(
- self, agent, first_content, second_content, expected_final
- ):
+ def test_length_finish_reason_requests_continuation(self, agent):
+ """Normal truncation (partial real content) triggers continuation."""
self._setup_agent(agent)
- first = _mock_response(content=first_content, finish_reason="length")
- second = _mock_response(content=second_content, finish_reason="stop")
+ first = _mock_response(content="Part 1 ", finish_reason="length")
+ second = _mock_response(content="Part 2", finish_reason="stop")
agent.client.chat.completions.create.side_effect = [first, second]
with (
@@ -1396,12 +1388,58 @@ class TestRunConversation:
assert result["completed"] is True
assert result["api_calls"] == 2
- assert result["final_response"] == expected_final
+ assert result["final_response"] == "Part 1 Part 2"
second_call_messages = agent.client.chat.completions.create.call_args_list[1].kwargs["messages"]
assert second_call_messages[-1]["role"] == "user"
assert "truncated by the output length limit" in second_call_messages[-1]["content"]
+ def test_length_thinking_exhausted_skips_continuation(self, agent):
+ """When finish_reason='length' but content is only thinking, skip retries."""
+ self._setup_agent(agent)
+ resp = _mock_response(
+ content="internal reasoning",
+ finish_reason="length",
+ )
+ agent.client.chat.completions.create.return_value = resp
+
+ with (
+ patch.object(agent, "_persist_session"),
+ patch.object(agent, "_save_trajectory"),
+ patch.object(agent, "_cleanup_task_resources"),
+ ):
+ result = agent.run_conversation("hello")
+
+ # Should return immediately — no continuation, only 1 API call
+ assert result["completed"] is False
+ assert result["api_calls"] == 1
+ assert "reasoning" in result["error"].lower()
+ assert "output tokens" in result["error"].lower()
+ # Should have a user-friendly response (not None)
+ assert result["final_response"] is not None
+ assert "Thinking Budget Exhausted" in result["final_response"]
+ assert "/thinkon" in result["final_response"]
+
+ def test_length_empty_content_detected_as_thinking_exhausted(self, agent):
+ """When finish_reason='length' and content is None/empty, detect exhaustion."""
+ self._setup_agent(agent)
+ resp = _mock_response(content=None, finish_reason="length")
+ agent.client.chat.completions.create.return_value = resp
+
+ with (
+ patch.object(agent, "_persist_session"),
+ patch.object(agent, "_save_trajectory"),
+ patch.object(agent, "_cleanup_task_resources"),
+ ):
+ result = agent.run_conversation("hello")
+
+ assert result["completed"] is False
+ assert result["api_calls"] == 1
+ assert "reasoning" in result["error"].lower()
+ # User-friendly message is returned
+ assert result["final_response"] is not None
+ assert "Thinking Budget Exhausted" in result["final_response"]
+
class TestRetryExhaustion:
"""Regression: retry_count > max_retries was dead code (off-by-one).