Files
hermes-agent/tests/run_agent/test_1630_context_overflow_loop.py
Siddharth Balyan f3006ebef9 refactor(tests): re-architect tests + fix CI failures (#5946)
* refactor: re-architect tests to mirror the codebase

* Update tests.yml

* fix: add missing tool_error imports after registry refactor

* fix(tests): replace patch.dict with monkeypatch to prevent env var leaks under xdist

patch.dict(os.environ) can leak TERMINAL_ENV across xdist workers,
causing test_code_execution tests to hit the Modal remote path.

* fix(tests): fix update_check and telegram xdist failures

- test_update_check: replace patch("hermes_cli.banner.os.getenv") with
  monkeypatch.setenv("HERMES_HOME") — banner.py no longer imports os
  directly, it uses get_hermes_home() from hermes_constants.

- test_telegram_conflict/approval_buttons: provide real exception classes
  for telegram.error mock (NetworkError, TimedOut, BadRequest) so the
  except clause in connect() doesn't fail with "catching classes that do
  not inherit from BaseException" when xdist pollutes sys.modules.

* fix(tests): accept unavailable_models kwarg in _prompt_model_selection mock
2026-04-07 17:19:07 -07:00

269 lines
11 KiB
Python

"""Tests for #1630 — gateway infinite 400 failure loop prevention.
Verifies that:
1. Generic 400 errors with large sessions are treated as context-length errors
and trigger compression instead of aborting.
2. The gateway does not persist messages when the agent fails early, preventing
the session from growing on each failure.
3. Context-overflow failures produce helpful error messages suggesting /compact.
"""
import pytest
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
# ---------------------------------------------------------------------------
# Test 1: Agent heuristic — generic 400 with large session → compression
# ---------------------------------------------------------------------------
class TestGeneric400Heuristic:
"""The agent should treat a generic 400 with a large session as a
probable context-length error and trigger compression, not abort."""
def _make_agent(self):
"""Create a minimal AIAgent for testing error handling."""
with (
patch("run_agent.get_tool_definitions", return_value=[]),
patch("run_agent.check_toolset_requirements", return_value={}),
patch("run_agent.OpenAI"),
):
from run_agent import AIAgent
a = AIAgent(
api_key="test-key-12345",
quiet_mode=True,
skip_context_files=True,
skip_memory=True,
)
a.client = MagicMock()
a._cached_system_prompt = "You are helpful."
a._use_prompt_caching = False
a.tool_delay = 0
a.compression_enabled = False
return a
def test_generic_400_with_small_session_is_client_error(self):
"""A generic 400 with a small session should still be treated
as a non-retryable client error (not context overflow)."""
error_msg = "error"
status_code = 400
approx_tokens = 1000 # Small session
api_messages = [{"role": "user", "content": "hi"}]
# Simulate the phrase matching
is_context_length_error = any(phrase in error_msg for phrase in [
'context length', 'context size', 'maximum context',
'token limit', 'too many tokens', 'reduce the length',
'exceeds the limit', 'context window',
'request entity too large',
'prompt is too long',
])
assert not is_context_length_error
# The heuristic should NOT trigger for small sessions
ctx_len = 200000
is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80
is_generic_error = len(error_msg.strip()) < 30
assert not is_large_session # Small session → heuristic doesn't fire
def test_generic_400_with_large_token_count_triggers_heuristic(self):
"""A generic 400 with high token count should be treated as
probable context overflow."""
error_msg = "error"
status_code = 400
ctx_len = 200000
approx_tokens = 100000 # > 40% of 200k
api_messages = [{"role": "user", "content": "hi"}] * 20
is_context_length_error = any(phrase in error_msg for phrase in [
'context length', 'context size', 'maximum context',
])
assert not is_context_length_error
# Heuristic check
is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80
is_generic_error = len(error_msg.strip()) < 30
assert is_large_session
assert is_generic_error
# Both conditions true → should be treated as context overflow
def test_generic_400_with_many_messages_triggers_heuristic(self):
"""A generic 400 with >80 messages should trigger the heuristic
even if estimated tokens are low."""
error_msg = "error"
status_code = 400
ctx_len = 200000
approx_tokens = 5000 # Low token estimate
api_messages = [{"role": "user", "content": "x"}] * 100 # > 80 messages
is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80
is_generic_error = len(error_msg.strip()) < 30
assert is_large_session
assert is_generic_error
def test_specific_error_message_bypasses_heuristic(self):
"""A 400 with a specific, long error message should NOT trigger
the heuristic even with a large session."""
error_msg = "invalid model: anthropic/claude-nonexistent-model is not available"
status_code = 400
ctx_len = 200000
approx_tokens = 100000
is_generic_error = len(error_msg.strip()) < 30
assert not is_generic_error # Long specific message → heuristic doesn't fire
def test_descriptive_context_error_caught_by_phrases(self):
"""Descriptive context-length errors should still be caught by
the existing phrase matching (not the heuristic)."""
error_msg = "prompt is too long: 250000 tokens > 200000 maximum"
is_context_length_error = any(phrase in error_msg for phrase in [
'context length', 'context size', 'maximum context',
'token limit', 'too many tokens', 'reduce the length',
'exceeds the limit', 'context window',
'request entity too large',
'prompt is too long',
])
assert is_context_length_error
# ---------------------------------------------------------------------------
# Test 2: Gateway skips persistence on failed agent results
# ---------------------------------------------------------------------------
class TestGatewaySkipsPersistenceOnFailure:
"""When the agent returns failed=True with no final_response,
the gateway should NOT persist messages to the transcript."""
def test_agent_failed_early_detected(self):
"""The agent_failed_early flag is True when failed=True and
no final_response."""
agent_result = {
"failed": True,
"final_response": None,
"messages": [],
"error": "Non-retryable client error",
}
agent_failed_early = (
agent_result.get("failed")
and not agent_result.get("final_response")
)
assert agent_failed_early
def test_agent_with_response_not_failed_early(self):
"""When the agent has a final_response, it's not a failed-early
scenario even if failed=True."""
agent_result = {
"failed": True,
"final_response": "Here is a partial response",
"messages": [],
}
agent_failed_early = (
agent_result.get("failed")
and not agent_result.get("final_response")
)
assert not agent_failed_early
def test_successful_agent_not_failed_early(self):
"""A successful agent result should not trigger skip."""
agent_result = {
"final_response": "Hello!",
"messages": [{"role": "assistant", "content": "Hello!"}],
}
agent_failed_early = (
agent_result.get("failed")
and not agent_result.get("final_response")
)
assert not agent_failed_early
# ---------------------------------------------------------------------------
# Test 3: Context-overflow error messages
# ---------------------------------------------------------------------------
class TestContextOverflowErrorMessages:
"""The gateway should produce helpful error messages when the failure
looks like a context overflow."""
def test_detects_context_keywords(self):
"""Error messages containing context-related keywords should be
identified as context failures."""
keywords = [
"context length exceeded",
"too many tokens in the prompt",
"request entity too large",
"payload too large for model",
"context window exceeded",
]
for error_str in keywords:
_is_ctx_fail = any(p in error_str.lower() for p in (
"context", "token", "too large", "too long",
"exceed", "payload",
))
assert _is_ctx_fail, f"Should detect: {error_str}"
def test_detects_generic_400_with_large_history(self):
"""A generic 400 error code in the string with a large history
should be flagged as context failure."""
error_str = "error code: 400 - {'type': 'error', 'message': 'Error'}"
history_len = 100 # Large session
_is_ctx_fail = any(p in error_str.lower() for p in (
"context", "token", "too large", "too long",
"exceed", "payload",
)) or (
"400" in error_str.lower()
and history_len > 50
)
assert _is_ctx_fail
def test_unrelated_error_not_flagged(self):
"""Unrelated errors should not be flagged as context failures."""
error_str = "invalid api key: authentication failed"
history_len = 10
_is_ctx_fail = any(p in error_str.lower() for p in (
"context", "token", "too large", "too long",
"exceed", "payload",
)) or (
"400" in error_str.lower()
and history_len > 50
)
assert not _is_ctx_fail
# ---------------------------------------------------------------------------
# Test 4: Agent skips persistence for large failed sessions
# ---------------------------------------------------------------------------
class TestAgentSkipsPersistenceForLargeFailedSessions:
"""When a 400 error occurs and the session is large, the agent
should skip persisting to prevent the growth loop."""
def test_large_session_400_skips_persistence(self):
"""Status 400 + high token count should skip persistence."""
status_code = 400
approx_tokens = 60000 # > 50000 threshold
api_messages = [{"role": "user", "content": "x"}] * 10
should_skip = status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80)
assert should_skip
def test_small_session_400_persists_normally(self):
"""Status 400 + small session should still persist."""
status_code = 400
approx_tokens = 5000 # < 50000
api_messages = [{"role": "user", "content": "x"}] * 10 # < 80
should_skip = status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80)
assert not should_skip
def test_non_400_error_persists_normally(self):
"""Non-400 errors should always persist normally."""
status_code = 401 # Auth error
approx_tokens = 100000 # Large session, but not a 400
api_messages = [{"role": "user", "content": "x"}] * 100
should_skip = status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80)
assert not should_skip