fix: proactive compression after large tool results + Anthropic error detection
Two fixes for context overflow handling:

1. Proactive compression after tool execution: the compression check now estimates the next prompt size using real token counts from the last API response (prompt_tokens + completion_tokens) plus a conservative estimate of newly appended tool results (chars // 3 for JSON-heavy content). Previously, should_compress() only checked last_prompt_tokens, which did not account for tool results — so a 130k-token prompt plus 100k chars of tool output would pass the 140k threshold check but fail the 200k API limit.

2. Safety net: added 'prompt is too long' to the context-length error-detection phrases. Anthropic returns "prompt is too long: N tokens > M maximum" on HTTP 400, which was not matched by the existing phrases. This ensures compression fires even if the proactive check underestimates.

Fixes #813
This commit is contained in:
Changed files:
  run_agent.py (17)
@@ -3872,6 +3872,7 @@ class AIAgent:
             'token limit', 'too many tokens', 'reduce the length',
             'exceeds the limit', 'context window',
             'request entity too large',  # OpenRouter/Nous 413 safety net
+            'prompt is too long',  # Anthropic: "prompt is too long: N tokens > M maximum"
         ])

         if is_context_length_error:
@@ -4256,6 +4257,7 @@ class AIAgent:

         messages.append(assistant_msg)

+        _msg_count_before_tools = len(messages)
         self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)

         # Refund the iteration if the ONLY tool(s) called were
@@ -4265,7 +4267,20 @@ class AIAgent:
         if _tc_names == {"execute_code"}:
             self.iteration_budget.refund()

-        if self.compression_enabled and self.context_compressor.should_compress():
+        # Estimate next prompt size using real token counts from the
+        # last API response + rough estimate of newly appended tool
+        # results. This catches cases where tool results push the
+        # context past the limit that last_prompt_tokens alone misses
+        # (e.g. large file reads, web extractions).
+        _compressor = self.context_compressor
+        _new_tool_msgs = messages[_msg_count_before_tools:]
+        _new_chars = sum(len(str(m.get("content", "") or "")) for m in _new_tool_msgs)
+        _estimated_next_prompt = (
+            _compressor.last_prompt_tokens
+            + _compressor.last_completion_tokens
+            + _new_chars // 3  # conservative: JSON-heavy tool results ≈ 3 chars/token
+        )
+        if self.compression_enabled and _compressor.should_compress(_estimated_next_prompt):
             messages, active_system_prompt = self._compress_context(
                 messages, system_message,
                 approx_tokens=self.context_compressor.last_prompt_tokens,
@@ -396,3 +396,73 @@ class TestPreflightCompression:
         result = agent.run_conversation("hello", conversation_history=big_history)

         mock_compress.assert_not_called()
+
+
class TestToolResultPreflightCompression:
    """Compression should trigger when tool results push context past the threshold."""

    def test_large_tool_results_trigger_compression(self, agent):
        """When tool results push estimated tokens past threshold, compress before next call."""
        # Configure the compressor so the last prompt alone (130k) is under the
        # 140k threshold, but 100k chars of tool output pushes the estimate over.
        agent.compression_enabled = True
        agent.context_compressor.context_length = 200_000
        agent.context_compressor.threshold_tokens = 140_000
        agent.context_compressor.last_prompt_tokens = 130_000
        agent.context_compressor.last_completion_tokens = 5_000

        tool_call = SimpleNamespace(
            id="tc1",
            type="function",
            function=SimpleNamespace(name="web_search", arguments='{"query":"test"}'),
        )
        # First API response requests a tool call; second completes normally.
        first_resp = _mock_response(
            content=None,
            finish_reason="stop",
            tool_calls=[tool_call],
            usage={"prompt_tokens": 130_000, "completion_tokens": 5_000, "total_tokens": 135_000},
        )
        final_resp = _mock_response(
            content="Done after compression",
            finish_reason="stop",
            usage={"prompt_tokens": 50_000, "completion_tokens": 100, "total_tokens": 50_100},
        )
        agent.client.chat.completions.create.side_effect = [first_resp, final_resp]
        oversized_payload = "x" * 100_000  # ~33k estimated tokens of tool output

        with (
            patch("run_agent.handle_function_call", return_value=oversized_payload),
            patch.object(agent, "_compress_context") as mock_compress,
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
        ):
            mock_compress.return_value = (
                [{"role": "user", "content": "hello"}],
                "compressed prompt",
            )
            result = agent.run_conversation("hello")

        mock_compress.assert_called_once()
        assert result["completed"] is True

    def test_anthropic_prompt_too_long_safety_net(self, agent):
        """Anthropic 'prompt is too long' error triggers compression as safety net."""
        # Reproduce the exact HTTP 400 error string Anthropic emits on overflow.
        overflow_err = Exception(
            "Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', "
            "'message': 'prompt is too long: 233153 tokens > 200000 maximum'}}"
        )
        overflow_err.status_code = 400
        recovered_resp = _mock_response(content="Recovered", finish_reason="stop")
        agent.client.chat.completions.create.side_effect = [overflow_err, recovered_resp]
        history = [
            {"role": "user", "content": "previous"},
            {"role": "assistant", "content": "answer"},
        ]

        with (
            patch.object(agent, "_compress_context") as mock_compress,
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
        ):
            mock_compress.return_value = (
                [{"role": "user", "content": "hello"}],
                "compressed",
            )
            result = agent.run_conversation("hello", conversation_history=history)

        mock_compress.assert_called_once()
        assert result["completed"] is True
Reference in New Issue
Block a user