diff --git a/run_agent.py b/run_agent.py
index 90575b3bb..ab0d14194 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -85,7 +85,7 @@ from agent.model_metadata import (
     fetch_model_metadata,
     estimate_tokens_rough, estimate_messages_tokens_rough, estimate_request_tokens_rough,
     get_next_probe_tier, parse_context_limit_from_error,
-    save_context_length,
+    save_context_length, is_local_endpoint,
 )
 from agent.context_compressor import ContextCompressor
 from agent.prompt_caching import apply_anthropic_cache_control
@@ -1565,6 +1565,74 @@ class AIAgent:
             return "\n\n".join(reasoning_parts)

         return None
+
+    def _classify_empty_content_response(
+        self,
+        assistant_message,
+        *,
+        finish_reason: Optional[str],
+        approx_tokens: int,
+        api_messages: List[Dict[str, Any]],
+        conversation_history: Optional[List[Dict[str, Any]]],
+    ) -> Dict[str, Any]:
+        """Classify a think-only/empty response so the caller can retry, compress, or salvage.
+
+        Structured-reasoning responses are intentionally NOT short-circuited outright: prior
+        discussion/PR history shows some models recover on retry. The returned flags let the caller
+        - compress immediately when the pattern looks like implicit context pressure
+        - salvage reasoning early when the same reasoning-only payload repeats
+        - otherwise preserve the normal retry path. Side effect: records this response's signature.
+        """
+        reasoning_text = self._extract_reasoning(assistant_message)
+        has_structured_reasoning = bool(  # true when any dedicated reasoning attribute is populated
+            getattr(assistant_message, "reasoning", None)
+            or getattr(assistant_message, "reasoning_content", None)
+            or getattr(assistant_message, "reasoning_details", None)
+        )
+        content = getattr(assistant_message, "content", None) or ""
+        stripped_content = self._strip_think_blocks(content).strip()
+        signature = (  # compared with the previously recorded empty response to detect repeats
+            content,
+            reasoning_text or "",
+            bool(has_structured_reasoning),
+            finish_reason or "",
+        )
+        repeated_signature = signature == getattr(self, "_last_empty_content_signature", None)
+
+        compressor = getattr(self, "context_compressor", None)
+        ctx_len = getattr(compressor, "context_length", 0) or 0
+        threshold_tokens = getattr(compressor, "threshold_tokens", 0) or 0
+        is_large_session = bool(  # tokens >= max(40% of context, compressor threshold), or > 80 messages
+            (ctx_len and approx_tokens >= max(int(ctx_len * 0.4), threshold_tokens))
+            or len(api_messages) > 80
+        )
+        is_local_custom = is_local_endpoint(getattr(self, "base_url", "") or "")
+        is_resumed = bool(conversation_history)  # a non-empty prior history was passed in
+        context_pressure_signals = any(
+            [
+                finish_reason == "length",
+                getattr(compressor, "_context_probed", False),
+                is_large_session,
+                is_resumed,
+            ]
+        )
+        should_compress = bool(  # every condition below must hold
+            self.compression_enabled
+            and is_local_custom
+            and context_pressure_signals
+            and not stripped_content
+        )
+
+        self._last_empty_content_signature = signature  # kept for the next call's repeat check
+        return {
+            "reasoning_text": reasoning_text,
+            "has_structured_reasoning": has_structured_reasoning,
+            "repeated_signature": repeated_signature,
+            "should_compress": should_compress,
+            "is_local_custom": is_local_custom,
+            "is_large_session": is_large_session,
+            "is_resumed": is_resumed,
+        }

     def _cleanup_task_resources(self, task_id: str) -> None:
         """Clean up VM and browser resources for a given 
task."""
@@ -8406,13 +8474,22 @@ class AIAgent:
                             self._response_was_previewed = True
                             break

-                        # No fallback available — this is a genuine empty response.
-                        # Retry in case the model just had a bad generation.
+                        # No fallback available — classify the empty response before
+                        # blindly spending retries. Some local/custom backends surface
+                        # implicit context pressure as reasoning-only output rather than
+                        # an explicit overflow error.
                         if not hasattr(self, '_empty_content_retries'):
                             self._empty_content_retries = 0
                         self._empty_content_retries += 1
-
-                        reasoning_text = self._extract_reasoning(assistant_message)
+
+                        empty_response_info = self._classify_empty_content_response(
+                            assistant_message,
+                            finish_reason=finish_reason,
+                            approx_tokens=approx_tokens,
+                            api_messages=api_messages,
+                            conversation_history=conversation_history,
+                        )
+                        reasoning_text = empty_response_info["reasoning_text"]
                         self._vprint(f"{self.log_prefix}⚠️ Response only contains think block with no content after it")
                         if reasoning_text:
                             reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text
@@ -8420,6 +8497,49 @@ class AIAgent:
                         else:
                             content_preview = final_response[:80] + "..." if len(final_response) > 80 else final_response
                             self._vprint(f"{self.log_prefix} Content: '{content_preview}'")
+
+                        if empty_response_info["should_compress"]:
+                            compression_attempts += 1
+                            if compression_attempts > max_compression_attempts:
+                                self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
+                                self._vprint(f"{self.log_prefix} 💡 Local/custom backend returned reasoning-only output with no visible content. This often means the resumed/large session exceeds the runtime context window. Try /new or lower model.context_length to the actual runtime limit.", force=True)
+                            else:
+                                self._vprint(f"{self.log_prefix}🗜️ Reasoning-only response looks like implicit context pressure — attempting compression ({compression_attempts}/{max_compression_attempts})...", force=True)
+                                original_len = len(messages)
+                                messages, active_system_prompt = self._compress_context(
+                                    messages, system_message, approx_tokens=approx_tokens,
+                                    task_id=effective_task_id,
+                                )
+                                if len(messages) < original_len:
+                                    conversation_history = None
+                                    self._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages after reasoning-only response, retrying...")
+                                    time.sleep(2)
+                                    api_call_count -= 1
+                                    self.iteration_budget.refund()
+                                    # Refund the empty-content retry too, so compression does not burn the retry budget.
+                                    self._empty_content_retries -= 1
+                                    retry_count += 1
+                                    continue
+                                self._vprint(f"{self.log_prefix} Compression could not shrink the session; falling back to retry/salvage logic.")
+
+                        if (
+                            reasoning_text
+                            and empty_response_info["repeated_signature"]
+                            and empty_response_info["has_structured_reasoning"]
+                        ):
+                            self._vprint(f"{self.log_prefix}ℹ️ Structured reasoning-only response repeated unchanged — using reasoning text directly.", force=True)
+                            self._empty_content_retries = 0
+                            # Clear the signature with the counter so it cannot flag a repeat in a later turn.
+                            self._last_empty_content_signature = None
+                            final_response = reasoning_text
+                            empty_msg = {
+                                "role": "assistant",
+                                "content": final_response,
+                                "reasoning": reasoning_text,
+                                "finish_reason": finish_reason,
+                            }
+                            messages.append(empty_msg)
+                            break

                         if self._empty_content_retries < 3:
                             self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...")
@@ -8476,18 +8596,29 @@ class AIAgent:
                             self._cleanup_task_resources(effective_task_id)
                             self._persist_session(messages, conversation_history)

+                            error_message = "Model generated only think blocks with no actual response after 3 retries"
+                            if empty_response_info["is_local_custom"]:
+                                error_message = (
+                                    "Local/custom backend returned reasoning-only output with no visible response after 3 retries. "
+                                    "Likely causes: wrong /v1 endpoint, runtime context window smaller than Hermes expects, "
+                                    "or a resumed/large session exceeding the backend's actual context limit."
+                                )
+                            # Drop the recorded signature on failure so the next turn starts with a clean repeat check.
+                            self._last_empty_content_signature = None
+
                             return {
                                 "final_response": final_response or None,
                                 "messages": messages,
                                 "api_calls": api_call_count,
                                 "completed": False,
                                 "partial": True,
-                                "error": "Model generated only think blocks with no actual response after 3 retries"
+                                "error": error_message
                             }

-                    # Reset retry counter on successful content
+                    # Reset retry counter/signature on successful content
                     if hasattr(self, '_empty_content_retries'):
                         self._empty_content_retries = 0
+                        self._last_empty_content_signature = None

                     if (
                         self.api_mode == "codex_responses"
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
index 88667c215..a6281b4ab 100644
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -170,13 +170,21 @@ def _mock_tool_call(name="web_search", arguments="{}", call_id=None):


 def _mock_response(
-    content="Hello", finish_reason="stop", tool_calls=None, reasoning=None, usage=None
+    content="Hello",
+    finish_reason="stop",
+    tool_calls=None,
+    reasoning=None,
+    reasoning_content=None,
+    reasoning_details=None,
+    usage=None,
 ):
     """Return a SimpleNamespace mimicking an OpenAI ChatCompletion response."""
     msg = _mock_assistant_msg(
         content=content,
         tool_calls=tool_calls,
         reasoning=reasoning,
+        reasoning_content=reasoning_content,
+        reasoning_details=reasoning_details,
     )
     choice = SimpleNamespace(message=msg, finish_reason=finish_reason)
     resp = SimpleNamespace(choices=[choice], model="test/model")
@@ -1498,6 +1506,75 @@ class TestRunConversation:
         assert result["completed"] is True
         assert result["final_response"] == "internal reasoning"

+    def test_empty_content_local_resumed_session_triggers_compression(self, agent):
+        """Local resumed reasoning-only responses should compress before burning retries."""
+        self._setup_agent(agent)
+        agent.base_url = "http://127.0.0.1:1234/v1"
+        agent.compression_enabled = True
+        empty_resp = 
_mock_response(
+            content=None,
+            finish_reason="stop",
+            reasoning_content="reasoning only",
+        )
+        ok_resp = _mock_response(content="Recovered after compression", finish_reason="stop")
+        prefill = [  # non-empty history, so the classifier's is_resumed flag is set
+            {"role": "user", "content": "old question"},
+            {"role": "assistant", "content": "old answer"},
+        ]
+
+        with (
+            patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp, ok_resp]),  # reasoning-only first, then a real answer
+            patch.object(agent, "_compress_context") as mock_compress,
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            mock_compress.return_value = (  # one message, fewer than before, so compression counts as having shrunk the session
+                [{"role": "user", "content": "compressed user message"}],
+                "compressed system prompt",
+            )
+            result = agent.run_conversation("hello", conversation_history=prefill)
+
+        mock_compress.assert_called_once()
+        assert result["completed"] is True
+        assert result["final_response"] == "Recovered after compression"
+        assert result["api_calls"] == 1  # compression retry is refunded, same as explicit overflow path
+
+    def test_empty_content_repeated_structured_reasoning_salvages_early(self, agent):
+        """Repeated identical structured reasoning-only responses should stop retrying early."""
+        self._setup_agent(agent)
+        empty_resp = _mock_response(
+            content=None,
+            finish_reason="stop",
+            reasoning_content="structured reasoning answer",
+        )
+        agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp]  # identical payload twice, so the second is a repeat
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("answer me")
+        assert result["completed"] is True
+        assert result["final_response"] == "structured reasoning answer"
+        assert result["api_calls"] == 2
+
+    def test_empty_content_local_custom_error_is_actionable(self, agent):
+        """Local/custom retries should return a diagnostic tailored to context/endpoint mismatch."""
+        self._setup_agent(agent)
+        agent.base_url = "http://127.0.0.1:1234/v1"  # loopback URL; presumably classified as local by is_local_endpoint (confirm)
+        empty_resp = _mock_response(content=None, finish_reason="stop")  # neither content nor reasoning
+        agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp, empty_resp]  # one response per attempt until the `< 3` retry check fails
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("answer me")
+        assert result["completed"] is False
+        assert "Local/custom backend returned reasoning-only output" in result["error"]
+        assert "wrong /v1 endpoint" in result["error"]
+
     def test_nous_401_refreshes_after_remint_and_retries(self, agent):
         self._setup_agent(agent)
         agent.provider = "nous"