diff --git a/run_agent.py b/run_agent.py
index 6a0eb7442..dd3d1156a 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -6340,6 +6340,62 @@ class AIAgent:
         if finish_reason == "length":
             self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True)
 
+            # ── Detect thinking-budget exhaustion ──────────────
+            # When the model spends ALL output tokens on reasoning
+            # and has none left for the response, continuation
+            # retries are pointless. Detect this early and give a
+            # targeted error instead of wasting 3 API calls.
+            _trunc_content = None
+            if self.api_mode == "chat_completions":
+                _trunc_msg = response.choices[0].message if (hasattr(response, "choices") and response.choices) else None
+                _trunc_content = getattr(_trunc_msg, "content", None) if _trunc_msg else None
+            elif self.api_mode == "anthropic_messages":
+                # Anthropic response.content is a list of blocks
+                _text_parts = []
+                for _blk in getattr(response, "content", []):
+                    if getattr(_blk, "type", None) == "text":
+                        _text_parts.append(getattr(_blk, "text", ""))
+                _trunc_content = "\n".join(_text_parts) if _text_parts else None
+
+            _thinking_exhausted = (
+                _trunc_content is not None
+                and not self._has_content_after_think_block(_trunc_content)
+            ) or _trunc_content is None
+
+            if _thinking_exhausted:
+                _exhaust_error = (
+                    "Model used all output tokens on reasoning with none left "
+                    "for the response. Try lowering reasoning effort or "
+                    "increasing max_tokens."
+                )
+                self._vprint(
+                    f"{self.log_prefix}💭 Reasoning exhausted the output token budget — "
+                    f"no visible response was produced.",
+                    force=True,
+                )
+                # Return a user-friendly message as the response so
+                # CLI (response box) and gateway (chat message) both
+                # display it naturally instead of a suppressed error.
+                _exhaust_response = (
+                    "⚠️ **Thinking Budget Exhausted**\n\n"
+                    "The model used all its output tokens on reasoning "
+                    "and had none left for the actual response.\n\n"
+                    "To fix this:\n"
+                    "→ Lower reasoning effort: `/thinkon low` or `/thinkon minimal`\n"
+                    "→ Increase the output token limit: "
+                    "set `model.max_tokens` in config.yaml"
+                )
+                self._cleanup_task_resources(effective_task_id)
+                self._persist_session(messages, conversation_history)
+                return {
+                    "final_response": _exhaust_response,
+                    "messages": messages,
+                    "api_calls": api_call_count,
+                    "completed": False,
+                    "partial": True,
+                    "error": _exhaust_error,
+                }
+
         if self.api_mode == "chat_completions":
             assistant_message = response.choices[0].message
             if not assistant_message.tool_calls:
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
index b6aaedf72..4a3537e9b 100644
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -1372,19 +1372,11 @@ class TestRunConversation:
         assert result["final_response"] == "Recovered after compression"
         assert result["completed"] is True
 
-    @pytest.mark.parametrize(
-        ("first_content", "second_content", "expected_final"),
-        [
-            ("Part 1 ", "Part 2", "Part 1 Part 2"),
-            ("internal reasoning", "Recovered final answer", "Recovered final answer"),
-        ],
-    )
-    def test_length_finish_reason_requests_continuation(
-        self, agent, first_content, second_content, expected_final
-    ):
+    def test_length_finish_reason_requests_continuation(self, agent):
+        """Normal truncation (partial real content) triggers continuation."""
         self._setup_agent(agent)
-        first = _mock_response(content=first_content, finish_reason="length")
-        second = _mock_response(content=second_content, finish_reason="stop")
+        first = _mock_response(content="Part 1 ", finish_reason="length")
+        second = _mock_response(content="Part 2", finish_reason="stop")
         agent.client.chat.completions.create.side_effect = [first, second]
 
         with (
@@ -1396,12 +1388,58 @@ class TestRunConversation:
         assert result["completed"] is True
         assert result["api_calls"] == 2
-        assert result["final_response"] == expected_final
+        assert result["final_response"] == "Part 1 Part 2"
 
         second_call_messages = agent.client.chat.completions.create.call_args_list[1].kwargs["messages"]
         assert second_call_messages[-1]["role"] == "user"
         assert "truncated by the output length limit" in second_call_messages[-1]["content"]
 
+    def test_length_thinking_exhausted_skips_continuation(self, agent):
+        """When finish_reason='length' but content is only thinking, skip retries."""
+        self._setup_agent(agent)
+        resp = _mock_response(
+            content="internal reasoning",
+            finish_reason="length",
+        )
+        agent.client.chat.completions.create.return_value = resp
+
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        # Should return immediately — no continuation, only 1 API call
+        assert result["completed"] is False
+        assert result["api_calls"] == 1
+        assert "reasoning" in result["error"].lower()
+        assert "output tokens" in result["error"].lower()
+        # Should have a user-friendly response (not None)
+        assert result["final_response"] is not None
+        assert "Thinking Budget Exhausted" in result["final_response"]
+        assert "/thinkon" in result["final_response"]
+
+    def test_length_empty_content_detected_as_thinking_exhausted(self, agent):
+        """When finish_reason='length' and content is None/empty, detect exhaustion."""
+        self._setup_agent(agent)
+        resp = _mock_response(content=None, finish_reason="length")
+        agent.client.chat.completions.create.return_value = resp
+
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        assert result["completed"] is False
+        assert result["api_calls"] == 1
+        assert "reasoning" in result["error"].lower()
+        # User-friendly message is returned
+        assert result["final_response"] is not None
+        assert "Thinking Budget Exhausted" in result["final_response"]
+
 
 class TestRetryExhaustion:
     """Regression: retry_count > max_retries was dead code (off-by-one).