# Patch overview (annotation only; text ahead of the first "diff --git" header is presumably skipped by patch tools - confirm before applying).
#   run_agent.py: adds a preflight compression step ahead of the main conversation loop (at most 3 passes, stopping when the message count stops shrinking or the rough token estimate drops below the threshold), and moves the non-retryable 4xx check below the context-length check while excluding context-length errors from it.
#   tests/test_413_compression.py: adds tests for HTTP 400 context-length errors triggering compression and for the preflight compression step.
# NOTE(review): newlines and indentation of this patch appear to have been collapsed into single spaces; the hunk contents below are kept as found and would need their original layout restored before the patch can apply.
diff --git a/run_agent.py b/run_agent.py index aef2fb884..62d641b7b 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2862,6 +2862,51 @@ class AIAgent: active_system_prompt = self._cached_system_prompt + # ── Preflight context compression ── + # Before entering the main loop, check if the loaded conversation + # history already exceeds the model's context threshold. This handles + # cases where a user switches to a model with a smaller context window + # while having a large existing session — compress proactively rather + # than waiting for an API error (which might be caught as a non-retryable + # 4xx and abort the request entirely). + if ( + self.compression_enabled + and len(messages) > self.context_compressor.protect_first_n + + self.context_compressor.protect_last_n + 1 + ): + _sys_tok_est = estimate_tokens_rough(active_system_prompt or "") + _msg_tok_est = estimate_messages_tokens_rough(messages) + _preflight_tokens = _sys_tok_est + _msg_tok_est + + if _preflight_tokens >= self.context_compressor.threshold_tokens: + logger.info( + "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)", + f"{_preflight_tokens:,}", + f"{self.context_compressor.threshold_tokens:,}", + self.model, + f"{self.context_compressor.context_length:,}", + ) + if not self.quiet_mode: + print( + f"📦 Preflight compression: ~{_preflight_tokens:,} tokens " + f">= {self.context_compressor.threshold_tokens:,} threshold" + ) + # May need multiple passes for very large sessions with small + # context windows (each pass summarises the middle N turns). 
+ for _pass in range(3): + _orig_len = len(messages) + messages, active_system_prompt = self._compress_context( + messages, system_message, approx_tokens=_preflight_tokens + ) + if len(messages) >= _orig_len: + break # Cannot compress further + # Re-estimate after compression + _sys_tok_est = estimate_tokens_rough(active_system_prompt or "") + _msg_tok_est = estimate_messages_tokens_rough(messages) + _preflight_tokens = _sys_tok_est + _msg_tok_est + if _preflight_tokens < self.context_compressor.threshold_tokens: + break # Under threshold + # Main conversation loop api_call_count = 0 final_response = None @@ -3287,37 +3332,10 @@ class AIAgent: "partial": True } - # Check for non-retryable client errors (4xx HTTP status codes). - # These indicate a problem with the request itself (bad model ID, - # invalid API key, forbidden, etc.) and will never succeed on retry. - # Note: 413 is excluded — it's handled above via compression. - is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413 - is_client_error = is_client_status_error or any(phrase in error_msg for phrase in [ - 'error code: 400', 'error code: 401', 'error code: 403', - 'error code: 404', 'error code: 422', - 'is not a valid model', 'invalid model', 'model not found', - 'invalid api key', 'invalid_api_key', 'authentication', - 'unauthorized', 'forbidden', 'not found', - ]) - - if is_client_error: - self._dump_api_request_debug( - api_kwargs, reason="non_retryable_client_error", error=api_error, - ) - print(f"{self.log_prefix}❌ Non-retryable client error detected. 
Aborting immediately.") - print(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.") - logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}") - self._persist_session(messages, conversation_history) - return { - "final_response": None, - "messages": messages, - "api_calls": api_call_count, - "completed": False, - "failed": True, - "error": str(api_error), - } - - # Check for non-retryable errors (context length exceeded) + # Check for context-length errors BEFORE generic 4xx handler. + # OpenRouter returns 400 (not 413) for "maximum context length" + # errors — if we let the generic 4xx handler catch those first, + # it aborts immediately instead of attempting compression+retry. is_context_length_error = any(phrase in error_msg for phrase in [ 'context length', 'maximum context', 'token limit', 'too many tokens', 'reduce the length', 'exceeds the limit', @@ -3348,6 +3366,37 @@ class AIAgent: "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.", "partial": True } + + # Check for non-retryable client errors (4xx HTTP status codes). + # These indicate a problem with the request itself (bad model ID, + # invalid API key, forbidden, etc.) and will never succeed on retry. + # Note: 413 and context-length errors are excluded — handled above + # via compression. 
+ is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413 + is_client_error = (is_client_status_error or any(phrase in error_msg for phrase in [ + 'error code: 400', 'error code: 401', 'error code: 403', + 'error code: 404', 'error code: 422', + 'is not a valid model', 'invalid model', 'model not found', + 'invalid api key', 'invalid_api_key', 'authentication', + 'unauthorized', 'forbidden', 'not found', + ])) and not is_context_length_error + + if is_client_error: + self._dump_api_request_debug( + api_kwargs, reason="non_retryable_client_error", error=api_error, + ) + print(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.") + print(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.") + logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}") + self._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "failed": True, + "error": str(api_error), + } if retry_count >= max_retries: print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.") diff --git a/tests/test_413_compression.py b/tests/test_413_compression.py index e6e0c216e..744fe41f1 100644 --- a/tests/test_413_compression.py +++ b/tests/test_413_compression.py @@ -1,7 +1,9 @@ -"""Tests for 413 payload-too-large → compression retry logic in AIAgent. +"""Tests for payload/context-length → compression retry logic in AIAgent. -Verifies that HTTP 413 errors trigger history compression and retry, -rather than being treated as non-retryable generic 4xx errors. 
+Verifies that: +- HTTP 413 errors trigger history compression and retry +- HTTP 400 context-length errors trigger compression (not generic 4xx abort) +- Preflight compression proactively compresses oversized sessions before API calls """ import uuid @@ -164,6 +166,74 @@ class TestHTTP413Compression: mock_compress.assert_called_once() assert result["completed"] is True + def test_400_context_length_triggers_compression(self, agent): + """A 400 with 'maximum context length' should trigger compression, not abort as generic 4xx. + + OpenRouter returns HTTP 400 (not 413) for context-length errors. Before + the fix, this was caught by the generic 4xx handler which aborted + immediately — now it correctly triggers compression+retry. + """ + err_400 = Exception( + "Error code: 400 - {'error': {'message': " + "\"This endpoint's maximum context length is 204800 tokens. " + "However, you requested about 270460 tokens.\", 'code': 400}}" + ) + err_400.status_code = 400 + ok_resp = _mock_response(content="Recovered after compression", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [err_400, ok_resp] + + prefill = [ + {"role": "user", "content": "previous question"}, + {"role": "assistant", "content": "previous answer"}, + ] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + mock_compress.return_value = ( + [{"role": "user", "content": "hello"}], + "compressed prompt", + ) + result = agent.run_conversation("hello", conversation_history=prefill) + + mock_compress.assert_called_once() + # Must NOT have "failed": True (which would mean the generic 4xx handler caught it) + assert result.get("failed") is not True + assert result["completed"] is True + assert result["final_response"] == "Recovered after compression" + + def test_400_reduce_length_triggers_compression(self, agent): + """A 400 with 
'reduce the length' should trigger compression.""" + err_400 = Exception( + "Error code: 400 - Please reduce the length of the messages" + ) + err_400.status_code = 400 + ok_resp = _mock_response(content="OK", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [err_400, ok_resp] + + prefill = [ + {"role": "user", "content": "previous question"}, + {"role": "assistant", "content": "previous answer"}, + ] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + mock_compress.return_value = ( + [{"role": "user", "content": "hello"}], + "compressed", + ) + result = agent.run_conversation("hello", conversation_history=prefill) + + mock_compress.assert_called_once() + assert result["completed"] is True + def test_413_cannot_compress_further(self, agent): """When compression can't reduce messages, return partial result.""" err_413 = _make_413_error() @@ -185,3 +255,95 @@ class TestHTTP413Compression: assert result["completed"] is False assert result.get("partial") is True assert "413" in result["error"] + + +class TestPreflightCompression: + """Preflight compression should compress history before the first API call.""" + + def test_preflight_compresses_oversized_history(self, agent): + """When loaded history exceeds the model's context threshold, compress before API call.""" + agent.compression_enabled = True + # Set a very small context so the history is "oversized" + agent.context_compressor.context_length = 100 + agent.context_compressor.threshold_tokens = 85 # 85% of 100 + + # Build a history that will be large enough to trigger preflight + # (20 iterations append 40 messages of roughly 41-46 characters each; presumably far above the 85-token threshold under the rough estimator - confirm) + big_history = [] + for i in range(20): + big_history.append({"role": "user", "content": f"Message number 
{i} with some extra text padding"}) + big_history.append({"role": 
"assistant", "content": f"Response number {i} with extra padding here"}) + + ok_resp = _mock_response(content="After preflight", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + # Simulate compression reducing messages + mock_compress.return_value = ( + [ + {"role": "user", "content": "[CONTEXT SUMMARY]: Previous conversation"}, + {"role": "user", "content": "hello"}, + ], + "new system prompt", + ) + result = agent.run_conversation("hello", conversation_history=big_history) + + # Compression must have run exactly once; with only one successful API response queued this is presumably the preflight pass (call order relative to the API call is not asserted here) + mock_compress.assert_called_once() + assert result["completed"] is True + assert result["final_response"] == "After preflight" + + def test_no_preflight_when_under_threshold(self, agent): + """When history fits within context, no preflight compression needed.""" + agent.compression_enabled = True + # Large context — history easily fits + agent.context_compressor.context_length = 1000000 + agent.context_compressor.threshold_tokens = 850000 + + small_history = [ + {"role": "user", "content": "hi"}, + {"role": "assistant", "content": "hello"}, + ] + + ok_resp = _mock_response(content="No compression needed", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello", conversation_history=small_history) + + mock_compress.assert_not_called() + assert result["completed"] is True + + def 
test_no_preflight_when_compression_disabled(self, agent): + """Preflight should not run when compression is disabled.""" 
+ agent.compression_enabled = False + agent.context_compressor.context_length = 100 + agent.context_compressor.threshold_tokens = 85 + + big_history = [ + {"role": "user", "content": "x" * 1000}, + {"role": "assistant", "content": "y" * 1000}, + ] * 10 + + ok_resp = _mock_response(content="OK", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [ok_resp] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello", conversation_history=big_history) + + mock_compress.assert_not_called()