issue-151/tests/test_1630_context_overflow_loop.py

"""Tests for #1630 — gateway infinite 400 failure loop prevention.

Verifies that:
1. Generic 400 errors with large sessions are treated as context-length errors
   and trigger compression instead of aborting.
2. The gateway does not persist messages when the agent fails early, preventing
   the session from growing on each failure.
3. Context-overflow failures produce helpful error messages suggesting /compact.
"""

import pytest
from types import SimpleNamespace
from unittest.mock import MagicMock, patch


# ---------------------------------------------------------------------------
# Test 1: Agent heuristic — generic 400 with large session → compression
# ---------------------------------------------------------------------------


class TestGeneric400Heuristic:
    """Size and message-shape conditions behind the generic-400 heuristic.

    The agent should treat a generic 400 with a large session as a
    probable context-length error and trigger compression, not abort.

    NOTE(review): every test here evaluates the conditions locally through
    the helpers below; none of them calls into ``run_agent``. The phrase
    list and thresholds presumably mirror the agent's own — confirm against
    ``run_agent.py`` whenever either side changes.
    """

    # Substrings that identify an explicit, descriptive context-length error.
    _CONTEXT_PHRASES = (
        'context length', 'context size', 'maximum context',
        'token limit', 'too many tokens', 'reduce the length',
        'exceeds the limit', 'context window',
        'request entity too large',
        'prompt is too long',
    )
    # A session counts as "large" above this fraction of the context window...
    _LARGE_SESSION_RATIO = 0.4
    # ...or above this many messages, whatever the token estimate says.
    _LARGE_SESSION_MESSAGES = 80
    # Messages strictly shorter than this (after stripping) count as "generic".
    _GENERIC_ERROR_LEN_LIMIT = 30

    @classmethod
    def _matches_context_phrase(cls, error_msg):
        """Return True if *error_msg* contains any known context-length phrase."""
        return any(phrase in error_msg for phrase in cls._CONTEXT_PHRASES)

    @classmethod
    def _is_large_session(cls, approx_tokens, message_count, ctx_len):
        """Return True if the token estimate or the message count is large."""
        return (
            approx_tokens > ctx_len * cls._LARGE_SESSION_RATIO
            or message_count > cls._LARGE_SESSION_MESSAGES
        )

    @classmethod
    def _is_generic_error(cls, error_msg):
        """Return True if the stripped *error_msg* is too short to be specific."""
        return len(error_msg.strip()) < cls._GENERIC_ERROR_LEN_LIMIT

    def _make_agent(self):
        """Create a minimal AIAgent for testing error handling.

        The agent is constructed while ``run_agent.get_tool_definitions``,
        ``run_agent.check_toolset_requirements`` and ``run_agent.OpenAI``
        are patched; its client is then replaced with a ``MagicMock`` and a
        few attributes are overridden. No test in this class calls it yet.
        """
        with (
            patch("run_agent.get_tool_definitions", return_value=[]),
            patch("run_agent.check_toolset_requirements", return_value={}),
            patch("run_agent.OpenAI"),
        ):
            from run_agent import AIAgent
            a = AIAgent(
                api_key="test-key-12345",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
            )
            a.client = MagicMock()
            a._cached_system_prompt = "You are helpful."
            a._use_prompt_caching = False
            a.tool_delay = 0
            a.compression_enabled = False
            return a

    def test_generic_400_with_small_session_is_client_error(self):
        """A generic 400 with a small session should still be treated
        as a non-retryable client error (not context overflow)."""
        error_msg = "error"
        ctx_len = 200000
        approx_tokens = 1000  # Small session
        api_messages = [{"role": "user", "content": "hi"}]

        assert not self._matches_context_phrase(error_msg)
        # The message is generic, so only the session size keeps the
        # heuristic from firing.
        assert self._is_generic_error(error_msg)
        assert not self._is_large_session(
            approx_tokens, len(api_messages), ctx_len
        )

    def test_generic_400_with_large_token_count_triggers_heuristic(self):
        """A generic 400 with high token count should be treated as
        probable context overflow."""
        error_msg = "error"
        ctx_len = 200000
        approx_tokens = 100000  # > 40% of 200k
        api_messages = [{"role": "user", "content": "hi"}] * 20

        assert not self._matches_context_phrase(error_msg)
        # Both conditions true → should be treated as context overflow
        assert self._is_large_session(approx_tokens, len(api_messages), ctx_len)
        assert self._is_generic_error(error_msg)

    def test_generic_400_with_many_messages_triggers_heuristic(self):
        """A generic 400 with >80 messages should trigger the heuristic
        even if estimated tokens are low."""
        error_msg = "error"
        ctx_len = 200000
        approx_tokens = 5000  # Low token estimate
        api_messages = [{"role": "user", "content": "x"}] * 100  # > 80 messages

        assert self._is_large_session(approx_tokens, len(api_messages), ctx_len)
        assert self._is_generic_error(error_msg)

    def test_specific_error_message_bypasses_heuristic(self):
        """A 400 with a specific, long error message should NOT trigger
        the heuristic even with a large session."""
        error_msg = "invalid model: anthropic/claude-nonexistent-model is not available"
        ctx_len = 200000
        approx_tokens = 100000

        # Establish the "large session" precondition from the docstring.
        assert self._is_large_session(approx_tokens, 0, ctx_len)
        # Long specific message → heuristic doesn't fire
        assert not self._is_generic_error(error_msg)

    def test_descriptive_context_error_caught_by_phrases(self):
        """Descriptive context-length errors should still be caught by
        the existing phrase matching (not the heuristic)."""
        error_msg = "prompt is too long: 250000 tokens > 200000 maximum"

        assert self._matches_context_phrase(error_msg)
        # Too long to count as generic, so the heuristic is not what catches it.
        assert not self._is_generic_error(error_msg)


# ---------------------------------------------------------------------------
# Test 2: Gateway skips persistence on failed agent results
# ---------------------------------------------------------------------------

class TestGatewaySkipsPersistenceOnFailure:
    """Checks the "failed early" flag that gates transcript persistence.

    When the agent returns failed=True with no final_response, the gateway
    should NOT persist messages to the transcript. These tests evaluate the
    flag expression on hand-built result dicts only.
    """

    @staticmethod
    def _failed_early(result):
        """Evaluate ``failed and not final_response`` on an agent result dict.

        The raw value of the ``and`` expression is returned, so a missing
        ``failed`` key yields ``None`` rather than ``False``.
        """
        return result.get("failed") and not result.get("final_response")

    def test_agent_failed_early_detected(self):
        """failed=True without a final_response sets the flag."""
        outcome = dict(
            failed=True,
            final_response=None,
            messages=[],
            error="Non-retryable client error",
        )
        assert self._failed_early(outcome)

    def test_agent_with_response_not_failed_early(self):
        """A final_response clears the flag even when failed=True."""
        outcome = dict(
            failed=True,
            final_response="Here is a partial response",
            messages=[],
        )
        assert not self._failed_early(outcome)

    def test_successful_agent_not_failed_early(self):
        """A result with no ``failed`` key never sets the flag."""
        outcome = dict(
            final_response="Hello!",
            messages=[{"role": "assistant", "content": "Hello!"}],
        )
        assert not self._failed_early(outcome)


# ---------------------------------------------------------------------------
# Test 3: Context-overflow error messages
# ---------------------------------------------------------------------------

class TestContextOverflowErrorMessages:
    """Checks how a failure is classified as a likely context overflow.

    The gateway should produce helpful error messages when the failure
    looks like a context overflow. These tests evaluate the classification
    expression on sample error strings only.
    """

    # Substrings whose presence marks an error string as context-related.
    _CTX_MARKERS = (
        "context", "token", "too large", "too long",
        "exceed", "payload",
    )

    @classmethod
    def _has_ctx_marker(cls, error_str):
        """Return True if the lowercased *error_str* contains any marker."""
        lowered = error_str.lower()
        return any(marker in lowered for marker in cls._CTX_MARKERS)

    @classmethod
    def _looks_like_ctx_failure(cls, error_str, history_len):
        """Marker match, or a "400" in the string with more than 50 history items."""
        if cls._has_ctx_marker(error_str):
            return True
        return "400" in error_str.lower() and history_len > 50

    def test_detects_context_keywords(self):
        """Each sample error string contains at least one marker."""
        samples = (
            "context length exceeded",
            "too many tokens in the prompt",
            "request entity too large",
            "payload too large for model",
            "context window exceeded",
        )
        for error_str in samples:
            assert self._has_ctx_marker(error_str), f"Should detect: {error_str}"

    def test_detects_generic_400_with_large_history(self):
        """A marker-free "400" error with a long history is still flagged."""
        message = "error code: 400 - {'type': 'error', 'message': 'Error'}"
        turns = 100  # Large session

        assert self._looks_like_ctx_failure(message, turns)

    def test_unrelated_error_not_flagged(self):
        """An auth error with a short history is not flagged."""
        message = "invalid api key: authentication failed"
        turns = 10

        assert not self._looks_like_ctx_failure(message, turns)


# ---------------------------------------------------------------------------
# Test 4: Agent skips persistence for large failed sessions
# ---------------------------------------------------------------------------

class TestAgentSkipsPersistenceForLargeFailedSessions:
    """Checks the condition under which a failed turn is not persisted.

    When a 400 error occurs and the session is large, the agent should
    skip persisting to prevent the growth loop. These tests evaluate the
    skip condition on sample values only.
    """

    @staticmethod
    def _skips_persistence(status_code, approx_tokens, api_messages):
        """Return True for a 400 with >50000 tokens or >80 messages."""
        if status_code != 400:
            return False
        return approx_tokens > 50000 or len(api_messages) > 80

    def test_large_session_400_skips_persistence(self):
        """A 400 with a token estimate above the threshold skips persistence."""
        history = [{"role": "user", "content": "x"}] * 10
        # 60000 tokens is above the 50000 threshold.
        assert self._skips_persistence(400, 60000, history)

    def test_small_session_400_persists_normally(self):
        """A 400 below both thresholds still persists."""
        history = [{"role": "user", "content": "x"}] * 10  # < 80
        # 5000 tokens is below the 50000 threshold.
        assert not self._skips_persistence(400, 5000, history)

    def test_non_400_error_persists_normally(self):
        """A non-400 status persists even when the session is large."""
        history = [{"role": "user", "content": "x"}] * 100
        # 401 is an auth error: large session, but not a 400.
        assert not self._skips_persistence(401, 100000, history)
fix: prevent infinite 400 loop on context overflow + block prompt injection via cache files (#1630, #1558)

* fix: prevent infinite 400 failure loop on context overflow (#1630)

When a gateway session exceeds the model's context window, Anthropic may return a generic 400 invalid_request_error with just 'Error' as the message. This bypassed the phrase-based context-length detection, causing the agent to treat it as a non-retryable client error. Worse, the failed user message was still persisted to the transcript, making the session even larger on each attempt — creating an infinite loop.

Three-layer fix:

1. run_agent.py — Fallback heuristic: when a 400 error has a very short generic message AND the session is large (>40% of context or >80 messages), treat it as a probable context overflow and trigger compression instead of aborting.
2. run_agent.py + gateway/run.py — Don't persist failed messages: when the agent returns failed=True before generating any response, skip writing the user's message to the transcript/DB. This prevents the session from growing on each failure.
3. gateway/run.py — Smarter error messages: detect context-overflow failures and suggest /compact or /reset specifically, instead of a generic 'try again' that will fail identically.

* fix(skills): detect prompt injection patterns and block cache file reads

Adds two security layers to prevent prompt injection via skills hub cache files (#1558):

1. read_file: blocks direct reads of ~/.hermes/skills/.hub/ directory (index-cache, catalog files). The 3.5MB clawhub_catalog_v1.json was the original injection vector — untrusted skill descriptions in the catalog contained adversarial text that the model executed.
2. skill_view: warns when skills are loaded from outside the trusted ~/.hermes/skills/ directory, and detects common injection patterns in skill content ("ignore previous instructions", "<system>", etc.).

Cherry-picked from PR #1562 by ygd58.
--------- Co-authored-by: buray <ygd58@users.noreply.github.com> 2026-03-17 01:50:59 -07:00			`"""Tests for #1630 — gateway infinite 400 failure loop prevention.`

			`Verifies that:`
			`1. Generic 400 errors with large sessions are treated as context-length errors`
			`and trigger compression instead of aborting.`
			`2. The gateway does not persist messages when the agent fails early, preventing`
			`the session from growing on each failure.`
			`3. Context-overflow failures produce helpful error messages suggesting /compact.`
			`"""`

			`import pytest`
			`from types import SimpleNamespace`
			`from unittest.mock import MagicMock, patch`


			`# ---------------------------------------------------------------------------`
			`# Test 1: Agent heuristic — generic 400 with large session → compression`
			`# ---------------------------------------------------------------------------`


			`class TestGeneric400Heuristic:`
			`"""The agent should treat a generic 400 with a large session as a`
			`probable context-length error and trigger compression, not abort."""`

			`def _make_agent(self):`
			`"""Create a minimal AIAgent for testing error handling."""`
			`with (`
			`patch("run_agent.get_tool_definitions", return_value=[]),`
			`patch("run_agent.check_toolset_requirements", return_value={}),`
			`patch("run_agent.OpenAI"),`
			`):`
			`from run_agent import AIAgent`
			`a = AIAgent(`
			`api_key="test-key-12345",`
			`quiet_mode=True,`
			`skip_context_files=True,`
			`skip_memory=True,`
			`)`
			`a.client = MagicMock()`
			`a._cached_system_prompt = "You are helpful."`
			`a._use_prompt_caching = False`
			`a.tool_delay = 0`
			`a.compression_enabled = False`
			`return a`

			`def test_generic_400_with_small_session_is_client_error(self):`
			`"""A generic 400 with a small session should still be treated`
			`as a non-retryable client error (not context overflow)."""`
			`error_msg = "error"`
			`status_code = 400`
			`approx_tokens = 1000 # Small session`
			`api_messages = [{"role": "user", "content": "hi"}]`

			`# Simulate the phrase matching`
			`is_context_length_error = any(phrase in error_msg for phrase in [`
			`'context length', 'context size', 'maximum context',`
			`'token limit', 'too many tokens', 'reduce the length',`
			`'exceeds the limit', 'context window',`
			`'request entity too large',`
			`'prompt is too long',`
			`])`
			`assert not is_context_length_error`

			`# The heuristic should NOT trigger for small sessions`
			`ctx_len = 200000`
			`is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80`
			`is_generic_error = len(error_msg.strip()) < 30`
			`assert not is_large_session # Small session → heuristic doesn't fire`

			`def test_generic_400_with_large_token_count_triggers_heuristic(self):`
			`"""A generic 400 with high token count should be treated as`
			`probable context overflow."""`
			`error_msg = "error"`
			`status_code = 400`
			`ctx_len = 200000`
			`approx_tokens = 100000 # > 40% of 200k`
			`api_messages = [{"role": "user", "content": "hi"}] * 20`

			`is_context_length_error = any(phrase in error_msg for phrase in [`
			`'context length', 'context size', 'maximum context',`
			`])`
			`assert not is_context_length_error`

			`# Heuristic check`
			`is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80`
			`is_generic_error = len(error_msg.strip()) < 30`
			`assert is_large_session`
			`assert is_generic_error`
			`# Both conditions true → should be treated as context overflow`

			`def test_generic_400_with_many_messages_triggers_heuristic(self):`
			`"""A generic 400 with >80 messages should trigger the heuristic`
			`even if estimated tokens are low."""`
			`error_msg = "error"`
			`status_code = 400`
			`ctx_len = 200000`
			`approx_tokens = 5000 # Low token estimate`
			`api_messages = [{"role": "user", "content": "x"}] * 100 # > 80 messages`

			`is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80`
			`is_generic_error = len(error_msg.strip()) < 30`
			`assert is_large_session`
			`assert is_generic_error`

			`def test_specific_error_message_bypasses_heuristic(self):`
			`"""A 400 with a specific, long error message should NOT trigger`
			`the heuristic even with a large session."""`
			`error_msg = "invalid model: anthropic/claude-nonexistent-model is not available"`
			`status_code = 400`
			`ctx_len = 200000`
			`approx_tokens = 100000`

			`is_generic_error = len(error_msg.strip()) < 30`
			`assert not is_generic_error # Long specific message → heuristic doesn't fire`

			`def test_descriptive_context_error_caught_by_phrases(self):`
			`"""Descriptive context-length errors should still be caught by`
			`the existing phrase matching (not the heuristic)."""`
			`error_msg = "prompt is too long: 250000 tokens > 200000 maximum"`
			`is_context_length_error = any(phrase in error_msg for phrase in [`
			`'context length', 'context size', 'maximum context',`
			`'token limit', 'too many tokens', 'reduce the length',`
			`'exceeds the limit', 'context window',`
			`'request entity too large',`
			`'prompt is too long',`
			`])`
			`assert is_context_length_error`


			`# ---------------------------------------------------------------------------`
			`# Test 2: Gateway skips persistence on failed agent results`
			`# ---------------------------------------------------------------------------`

			`class TestGatewaySkipsPersistenceOnFailure:`
			`"""When the agent returns failed=True with no final_response,`
			`the gateway should NOT persist messages to the transcript."""`

			`def test_agent_failed_early_detected(self):`
			`"""The agent_failed_early flag is True when failed=True and`
			`no final_response."""`
			`agent_result = {`
			`"failed": True,`
			`"final_response": None,`
			`"messages": [],`
			`"error": "Non-retryable client error",`
			`}`
			`agent_failed_early = (`
			`agent_result.get("failed")`
			`and not agent_result.get("final_response")`
			`)`
			`assert agent_failed_early`

			`def test_agent_with_response_not_failed_early(self):`
			`"""When the agent has a final_response, it's not a failed-early`
			`scenario even if failed=True."""`
			`agent_result = {`
			`"failed": True,`
			`"final_response": "Here is a partial response",`
			`"messages": [],`
			`}`
			`agent_failed_early = (`
			`agent_result.get("failed")`
			`and not agent_result.get("final_response")`
			`)`
			`assert not agent_failed_early`

			`def test_successful_agent_not_failed_early(self):`
			`"""A successful agent result should not trigger skip."""`
			`agent_result = {`
			`"final_response": "Hello!",`
			`"messages": [{"role": "assistant", "content": "Hello!"}],`
			`}`
			`agent_failed_early = (`
			`agent_result.get("failed")`
			`and not agent_result.get("final_response")`
			`)`
			`assert not agent_failed_early`


			`# ---------------------------------------------------------------------------`
			`# Test 3: Context-overflow error messages`
			`# ---------------------------------------------------------------------------`

			`class TestContextOverflowErrorMessages:`
			`"""The gateway should produce helpful error messages when the failure`
			`looks like a context overflow."""`

			`def test_detects_context_keywords(self):`
			`"""Error messages containing context-related keywords should be`
			`identified as context failures."""`
			`keywords = [`
			`"context length exceeded",`
			`"too many tokens in the prompt",`
			`"request entity too large",`
			`"payload too large for model",`
			`"context window exceeded",`
			`]`
			`for error_str in keywords:`
			`_is_ctx_fail = any(p in error_str.lower() for p in (`
			`"context", "token", "too large", "too long",`
			`"exceed", "payload",`
			`))`
			`assert _is_ctx_fail, f"Should detect: {error_str}"`

			`def test_detects_generic_400_with_large_history(self):`
			`"""A generic 400 error code in the string with a large history`
			`should be flagged as context failure."""`
			`error_str = "error code: 400 - {'type': 'error', 'message': 'Error'}"`
			`history_len = 100 # Large session`

			`_is_ctx_fail = any(p in error_str.lower() for p in (`
			`"context", "token", "too large", "too long",`
			`"exceed", "payload",`
			`)) or (`
			`"400" in error_str.lower()`
			`and history_len > 50`
			`)`
			`assert _is_ctx_fail`

			`def test_unrelated_error_not_flagged(self):`
			`"""Unrelated errors should not be flagged as context failures."""`
			`error_str = "invalid api key: authentication failed"`
			`history_len = 10`

			`_is_ctx_fail = any(p in error_str.lower() for p in (`
			`"context", "token", "too large", "too long",`
			`"exceed", "payload",`
			`)) or (`
			`"400" in error_str.lower()`
			`and history_len > 50`
			`)`
			`assert not _is_ctx_fail`


			`# ---------------------------------------------------------------------------`
			`# Test 4: Agent skips persistence for large failed sessions`
			`# ---------------------------------------------------------------------------`

			`class TestAgentSkipsPersistenceForLargeFailedSessions:`
			`"""When a 400 error occurs and the session is large, the agent`
			`should skip persisting to prevent the growth loop."""`

			`def test_large_session_400_skips_persistence(self):`
			`"""Status 400 + high token count should skip persistence."""`
			`status_code = 400`
			`approx_tokens = 60000 # > 50000 threshold`
			`api_messages = [{"role": "user", "content": "x"}] * 10`

			`should_skip = status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80)`
			`assert should_skip`

			`def test_small_session_400_persists_normally(self):`
			`"""Status 400 + small session should still persist."""`
			`status_code = 400`
			`approx_tokens = 5000 # < 50000`
			`api_messages = [{"role": "user", "content": "x"}] * 10 # < 80`

			`should_skip = status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80)`
			`assert not should_skip`

			`def test_non_400_error_persists_normally(self):`
			`"""Non-400 errors should always persist normally."""`
			`status_code = 401 # Auth error`
			`approx_tokens = 100000 # Large session, but not a 400`
			`api_messages = [{"role": "user", "content": "x"}] * 100`

			`should_skip = status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80)`
			`assert not should_skip`