diff --git a/run_agent.py b/run_agent.py index b79a1578a..8ac361594 100644 --- a/run_agent.py +++ b/run_agent.py @@ -3284,7 +3284,7 @@ class AIAgent: api_messages = [] for msg in messages: api_msg = msg.copy() - + # For ALL assistant messages, pass reasoning back to the API # This ensures multi-turn reasoning context is preserved if msg.get("role") == "assistant": @@ -3292,7 +3292,7 @@ class AIAgent: if reasoning_text: # Add reasoning_content for API compatibility (Moonshot AI, Novita, OpenRouter) api_msg["reasoning_content"] = reasoning_text - + # Remove 'reasoning' field - it's for trajectory storage only # We've copied it to 'reasoning_content' for the API above if "reasoning" in api_msg: @@ -3303,7 +3303,7 @@ class AIAgent: # Keep 'reasoning_details' - OpenRouter uses this for multi-turn reasoning context # The signature field helps maintain reasoning continuity api_messages.append(api_msg) - + # Build the final system message: cached prompt + ephemeral system prompt. # The ephemeral part is appended here (not baked into the cached prompt) # so it stays out of the session DB and logs. @@ -3316,21 +3316,21 @@ class AIAgent: effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip() if effective_system: api_messages = [{"role": "system", "content": effective_system}] + api_messages - + # Inject ephemeral prefill messages right after the system prompt # but before conversation history. Same API-call-time-only pattern. if self.prefill_messages: sys_offset = 1 if effective_system else 0 for idx, pfm in enumerate(self.prefill_messages): api_messages.insert(sys_offset + idx, pfm.copy()) - + # Apply Anthropic prompt caching for Claude models via OpenRouter. # Auto-detected: if model name contains "claude" and base_url is OpenRouter, # inject cache_control breakpoints (system + last 3 messages) to reduce # input token costs by ~75% on multi-turn conversations. if self._use_prompt_caching: api_messages = apply_anthropic_cache_control(api_messages, cache_ttl=self._cache_ttl) - + # Safety net: strip orphaned tool results / add stubs for missing # results before sending to the API. The compressor handles this # during compression, but orphans can also sneak in from session @@ -3374,6 +3374,7 @@ class AIAgent: max_compression_attempts = 3 codex_auth_retry_attempted = False nous_auth_retry_attempted = False + restart_with_compressed_messages = False finish_reason = "stop" response = None # Guard against UnboundLocalError if all retries fail @@ -3707,7 +3708,8 @@ class AIAgent: if len(messages) < original_len: print(f"{self.log_prefix} 🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") time.sleep(2) # Brief pause between compression retries - continue # Retry with compressed messages + restart_with_compressed_messages = True + break else: print(f"{self.log_prefix}❌ Payload too large and cannot compress further.") logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.") @@ -3775,7 +3777,8 @@ class AIAgent: if len(messages) < original_len: print(f"{self.log_prefix} 🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") time.sleep(2) # Brief pause between compression retries - continue # Retry with compressed messages or new tier + restart_with_compressed_messages = True + break else: # Can't compress further and already at minimum tier print(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.") @@ -3862,6 +3865,11 @@ class AIAgent: if interrupted: break + if restart_with_compressed_messages: + api_call_count -= 1 + self.iteration_budget.refund() + continue + # Guard: if all retries exhausted without a successful response # (e.g. repeated context-length errors that exhausted retry_count), # the `response` variable is still None. Break out cleanly. diff --git a/tests/test_413_compression.py b/tests/test_413_compression.py index 744fe41f1..62fee8b8e 100644 --- a/tests/test_413_compression.py +++ b/tests/test_413_compression.py @@ -234,6 +234,55 @@ class TestHTTP413Compression: mock_compress.assert_called_once() assert result["completed"] is True + def test_context_length_retry_rebuilds_request_after_compression(self, agent): + """Retry must send the compressed transcript, not the stale oversized payload.""" + err_400 = Exception( + "Error code: 400 - {'error': {'message': " + "\"This endpoint's maximum context length is 128000 tokens. " + "Please reduce the length of the messages.\"}}" + ) + err_400.status_code = 400 + ok_resp = _mock_response(content="Recovered after real compression", finish_reason="stop") + + request_payloads = [] + + def _side_effect(**kwargs): + request_payloads.append(kwargs) + if len(request_payloads) == 1: + raise err_400 + return ok_resp + + agent.client.chat.completions.create.side_effect = _side_effect + + prefill = [ + {"role": "user", "content": "previous question"}, + {"role": "assistant", "content": "previous answer"}, + ] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + mock_compress.return_value = ( + [{"role": "user", "content": "compressed summary"}], + "compressed prompt", + ) + result = agent.run_conversation("hello", conversation_history=prefill) + + assert result["completed"] is True + assert len(request_payloads) == 2 + assert len(request_payloads[1]["messages"]) < len(request_payloads[0]["messages"]) + assert request_payloads[1]["messages"][0] == { + "role": "system", + "content": "compressed prompt", + } + assert request_payloads[1]["messages"][1] == { + "role": "user", + "content": "compressed summary", + } + def test_413_cannot_compress_further(self, agent): """When compression can't reduce messages, return partial result.""" err_413 = _make_413_error()