* fix: prevent infinite 400 failure loop on context overflow (#1630) When a gateway session exceeds the model's context window, Anthropic may return a generic 400 invalid_request_error with just 'Error' as the message. This bypassed the phrase-based context-length detection, causing the agent to treat it as a non-retryable client error. Worse, the failed user message was still persisted to the transcript, making the session even larger on each attempt — creating an infinite loop. Three-layer fix: 1. run_agent.py — Fallback heuristic: when a 400 error has a very short generic message AND the session is large (>40% of context or >80 messages), treat it as a probable context overflow and trigger compression instead of aborting. 2. run_agent.py + gateway/run.py — Don't persist failed messages: when the agent returns failed=True before generating any response, skip writing the user's message to the transcript/DB. This prevents the session from growing on each failure. 3. gateway/run.py — Smarter error messages: detect context-overflow failures and suggest /compact or /reset specifically, instead of a generic 'try again' that will fail identically. * fix(skills): detect prompt injection patterns and block cache file reads Adds two security layers to prevent prompt injection via skills hub cache files (#1558): 1. read_file: blocks direct reads of ~/.hermes/skills/.hub/ directory (index-cache, catalog files). The 3.5MB clawhub_catalog_v1.json was the original injection vector — untrusted skill descriptions in the catalog contained adversarial text that the model executed. 2. skill_view: warns when skills are loaded from outside the trusted ~/.hermes/skills/ directory, and detects common injection patterns in skill content ("ignore previous instructions", "<system>", etc.). Cherry-picked from PR #1562 by ygd58. --------- Co-authored-by: buray <ygd58@users.noreply.github.com>
269 lines
11 KiB
Python
269 lines
11 KiB
Python
"""Tests for #1630 — gateway infinite 400 failure loop prevention.

Verifies that:
1. Generic 400 errors with large sessions are treated as context-length errors
   and trigger compression instead of aborting.
2. The gateway does not persist messages when the agent fails early, preventing
   the session from growing on each failure.
3. Context-overflow failures produce helpful error messages suggesting /compact.
"""
|
|
|
|
import pytest
|
|
from types import SimpleNamespace
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Test 1: Agent heuristic — generic 400 with large session → compression
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestGeneric400Heuristic:
    """The agent should treat a generic 400 with a large session as a
    probable context-length error and trigger compression, not abort.

    NOTE(review): these tests restate the heuristic locally (see the
    private helpers below) rather than driving ``run_agent``; the
    thresholds here must be kept in sync with the agent by hand.
    """

    # Substrings that identify an explicit, descriptive context-length error.
    _CONTEXT_PHRASES = (
        'context length', 'context size', 'maximum context',
        'token limit', 'too many tokens', 'reduce the length',
        'exceeds the limit', 'context window',
        'request entity too large',
        'prompt is too long',
    )
    # A session is "large" above this fraction of the context window ...
    _LARGE_SESSION_RATIO = 0.4
    # ... or above this many messages.
    _LARGE_SESSION_MESSAGES = 80
    # Error messages shorter than this (after stripping) count as generic.
    _GENERIC_ERROR_MAX_LEN = 30

    @classmethod
    def _matches_context_phrase(cls, error_msg):
        """Return True if *error_msg* contains a known context-length phrase."""
        return any(phrase in error_msg for phrase in cls._CONTEXT_PHRASES)

    @classmethod
    def _is_large_session(cls, approx_tokens, api_messages, ctx_len):
        """Return True if the session is large by token share or message count."""
        return (
            approx_tokens > ctx_len * cls._LARGE_SESSION_RATIO
            or len(api_messages) > cls._LARGE_SESSION_MESSAGES
        )

    @classmethod
    def _is_generic_error(cls, error_msg):
        """Return True if *error_msg* is too short to carry a real diagnosis."""
        return len(error_msg.strip()) < cls._GENERIC_ERROR_MAX_LEN

    @classmethod
    def _heuristic_fires(cls, status_code, error_msg, approx_tokens,
                         api_messages, ctx_len):
        """Return True if a 400 should be treated as probable context overflow.

        All three conditions must hold: the status is 400, the message is
        generic, and the session is large.
        """
        return (
            status_code == 400
            and cls._is_generic_error(error_msg)
            and cls._is_large_session(approx_tokens, api_messages, ctx_len)
        )

    def _make_agent(self):
        """Create a minimal AIAgent for testing error handling.

        Currently unused by the tests in this class; kept for tests that
        need a real agent instance with its external dependencies patched.
        """
        with (
            patch("run_agent.get_tool_definitions", return_value=[]),
            patch("run_agent.check_toolset_requirements", return_value={}),
            patch("run_agent.OpenAI"),
        ):
            # Imported lazily so the patches above are active during import
            # and construction.
            from run_agent import AIAgent
            a = AIAgent(
                api_key="test-key-12345",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
            )
            a.client = MagicMock()
            a._cached_system_prompt = "You are helpful."
            a._use_prompt_caching = False
            a.tool_delay = 0
            a.compression_enabled = False
            return a

    def test_generic_400_with_small_session_is_client_error(self):
        """A generic 400 with a small session should still be treated
        as a non-retryable client error (not context overflow)."""
        error_msg = "error"
        status_code = 400
        ctx_len = 200000
        approx_tokens = 1000  # Small session
        api_messages = [{"role": "user", "content": "hi"}]

        # Phrase matching must not classify a bare "error" as context overflow.
        assert not self._matches_context_phrase(error_msg)

        # The message IS generic; only the session size keeps the heuristic off.
        assert self._is_generic_error(error_msg)
        assert not self._is_large_session(approx_tokens, api_messages, ctx_len)
        assert not self._heuristic_fires(
            status_code, error_msg, approx_tokens, api_messages, ctx_len
        )

    def test_generic_400_with_large_token_count_triggers_heuristic(self):
        """A generic 400 with high token count should be treated as
        probable context overflow."""
        error_msg = "error"
        status_code = 400
        ctx_len = 200000
        approx_tokens = 100000  # > 40% of 200k
        api_messages = [{"role": "user", "content": "hi"}] * 20

        assert not self._matches_context_phrase(error_msg)

        # Heuristic check
        assert self._is_large_session(approx_tokens, api_messages, ctx_len)
        assert self._is_generic_error(error_msg)
        # Both conditions true → should be treated as context overflow
        assert self._heuristic_fires(
            status_code, error_msg, approx_tokens, api_messages, ctx_len
        )

    def test_generic_400_with_many_messages_triggers_heuristic(self):
        """A generic 400 with >80 messages should trigger the heuristic
        even if estimated tokens are low."""
        error_msg = "error"
        status_code = 400
        ctx_len = 200000
        approx_tokens = 5000  # Low token estimate
        api_messages = [{"role": "user", "content": "x"}] * 100  # > 80 messages

        assert self._is_large_session(approx_tokens, api_messages, ctx_len)
        assert self._is_generic_error(error_msg)
        assert self._heuristic_fires(
            status_code, error_msg, approx_tokens, api_messages, ctx_len
        )

    def test_specific_error_message_bypasses_heuristic(self):
        """A 400 with a specific, long error message should NOT trigger
        the heuristic even with a large session."""
        error_msg = "invalid model: anthropic/claude-nonexistent-model is not available"
        status_code = 400
        ctx_len = 200000
        approx_tokens = 100000
        api_messages = [{"role": "user", "content": "hi"}] * 20

        # The session really is large, so only the message length can block it.
        assert self._is_large_session(approx_tokens, api_messages, ctx_len)
        # Long specific message → heuristic doesn't fire
        assert not self._is_generic_error(error_msg)
        assert not self._heuristic_fires(
            status_code, error_msg, approx_tokens, api_messages, ctx_len
        )

    def test_descriptive_context_error_caught_by_phrases(self):
        """Descriptive context-length errors should still be caught by
        the existing phrase matching (not the heuristic)."""
        error_msg = "prompt is too long: 250000 tokens > 200000 maximum"
        assert self._matches_context_phrase(error_msg)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Test 2: Gateway skips persistence on failed agent results
# ---------------------------------------------------------------------------
|
|
|
|
class TestGatewaySkipsPersistenceOnFailure:
    """When the agent returns failed=True with no final_response,
    the gateway should NOT persist messages to the transcript.

    NOTE(review): the "failed early" predicate is restated locally in
    ``_failed_early`` rather than imported from the gateway; keep the two
    in sync.
    """

    @staticmethod
    def _failed_early(agent_result):
        """Return True if *agent_result* is failed and has no final response.

        Always returns a real ``bool`` (a bare ``x and not y`` would leak
        ``None`` when the "failed" key is absent).
        """
        return bool(agent_result.get("failed")) and not agent_result.get("final_response")

    def test_agent_failed_early_detected(self):
        """The agent_failed_early flag is True when failed=True and
        no final_response."""
        agent_result = {
            "failed": True,
            "final_response": None,
            "messages": [],
            "error": "Non-retryable client error",
        }
        assert self._failed_early(agent_result) is True

    def test_agent_with_response_not_failed_early(self):
        """When the agent has a final_response, it's not a failed-early
        scenario even if failed=True."""
        agent_result = {
            "failed": True,
            "final_response": "Here is a partial response",
            "messages": [],
        }
        assert self._failed_early(agent_result) is False

    def test_successful_agent_not_failed_early(self):
        """A successful agent result should not trigger skip."""
        agent_result = {
            "final_response": "Hello!",
            "messages": [{"role": "assistant", "content": "Hello!"}],
        }
        assert self._failed_early(agent_result) is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Test 3: Context-overflow error messages
# ---------------------------------------------------------------------------
|
|
|
|
class TestContextOverflowErrorMessages:
    """The gateway should produce helpful error messages when the failure
    looks like a context overflow.

    NOTE(review): the detection predicate is restated locally in
    ``_is_context_failure`` rather than imported from the gateway; keep
    the two in sync.
    """

    # Case-insensitive substrings that hint at a context/size problem.
    _CONTEXT_KEYWORDS = (
        "context", "token", "too large", "too long",
        "exceed", "payload",
    )
    # A bare "400" only counts as context failure above this history length.
    _LARGE_HISTORY = 50

    @classmethod
    def _is_context_failure(cls, error_str, history_len=0):
        """Return True if *error_str* looks like a context-overflow failure.

        Either the lower-cased error contains one of the context keywords,
        or it mentions "400" while the history is longer than
        ``_LARGE_HISTORY`` entries.
        """
        lowered = error_str.lower()
        if any(keyword in lowered for keyword in cls._CONTEXT_KEYWORDS):
            return True
        return "400" in lowered and history_len > cls._LARGE_HISTORY

    def test_detects_context_keywords(self):
        """Error messages containing context-related keywords should be
        identified as context failures."""
        error_strings = [
            "context length exceeded",
            "too many tokens in the prompt",
            "request entity too large",
            "payload too large for model",
            "context window exceeded",
        ]
        for error_str in error_strings:
            assert self._is_context_failure(error_str), f"Should detect: {error_str}"

    def test_detects_generic_400_with_large_history(self):
        """A generic 400 error code in the string with a large history
        should be flagged as context failure."""
        error_str = "error code: 400 - {'type': 'error', 'message': 'Error'}"
        history_len = 100  # Large session

        assert self._is_context_failure(error_str, history_len)
        # The same error with a short history must NOT be flagged, proving the
        # history-length branch (not a keyword) is what matched above.
        assert not self._is_context_failure(error_str, 10)

    def test_unrelated_error_not_flagged(self):
        """Unrelated errors should not be flagged as context failures."""
        error_str = "invalid api key: authentication failed"
        history_len = 10

        assert not self._is_context_failure(error_str, history_len)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Test 4: Agent skips persistence for large failed sessions
# ---------------------------------------------------------------------------
|
|
|
|
class TestAgentSkipsPersistenceForLargeFailedSessions:
    """When a 400 error occurs and the session is large, the agent
    should skip persisting to prevent the growth loop.

    NOTE(review): the skip predicate is restated locally in
    ``_should_skip_persistence`` rather than imported from the agent; keep
    the thresholds in sync.
    """

    # Sessions above either threshold count as "large" for the skip decision.
    _TOKEN_THRESHOLD = 50000
    _MESSAGE_THRESHOLD = 80

    @classmethod
    def _should_skip_persistence(cls, status_code, approx_tokens, api_messages):
        """Return True if a failed request should not be persisted.

        Only a 400 on a large session (by tokens or message count) skips
        persistence; every other combination persists normally.
        """
        is_large = (
            approx_tokens > cls._TOKEN_THRESHOLD
            or len(api_messages) > cls._MESSAGE_THRESHOLD
        )
        return status_code == 400 and is_large

    def test_large_session_400_skips_persistence(self):
        """Status 400 + high token count should skip persistence."""
        status_code = 400
        approx_tokens = 60000  # > 50000 threshold
        api_messages = [{"role": "user", "content": "x"}] * 10

        assert self._should_skip_persistence(status_code, approx_tokens, api_messages)

    def test_small_session_400_persists_normally(self):
        """Status 400 + small session should still persist."""
        status_code = 400
        approx_tokens = 5000  # < 50000
        api_messages = [{"role": "user", "content": "x"}] * 10  # < 80

        assert not self._should_skip_persistence(status_code, approx_tokens, api_messages)

    def test_non_400_error_persists_normally(self):
        """Non-400 errors should always persist normally."""
        status_code = 401  # Auth error
        approx_tokens = 100000  # Large session, but not a 400
        api_messages = [{"role": "user", "content": "x"}] * 100

        assert not self._should_skip_persistence(status_code, approx_tokens, api_messages)
|