* refactor: re-architect tests to mirror the codebase
* Update tests.yml
* fix: add missing tool_error imports after registry refactor
* fix(tests): replace patch.dict with monkeypatch to prevent env var leaks under xdist
patch.dict(os.environ) can leak TERMINAL_ENV across xdist workers,
causing test_code_execution tests to hit the Modal remote path.
* fix(tests): fix update_check and telegram xdist failures
- test_update_check: replace patch("hermes_cli.banner.os.getenv") with
monkeypatch.setenv("HERMES_HOME") — banner.py no longer imports os
directly, it uses get_hermes_home() from hermes_constants.
- test_telegram_conflict/approval_buttons: provide real exception classes
for telegram.error mock (NetworkError, TimedOut, BadRequest) so the
except clause in connect() doesn't fail with "catching classes that do
not inherit from BaseException" when xdist pollutes sys.modules.
* fix(tests): accept unavailable_models kwarg in _prompt_model_selection mock
203 lines · 8.8 KiB · Python
"""Tests for context compression persistence in the gateway.

Verifies that when context compression fires during run_conversation(),
the compressed messages are properly persisted to both SQLite (via the
agent) and JSONL (via the gateway).

Bug scenario (pre-fix):
1. Gateway loads 200-message history, passes to agent
2. Agent's run_conversation() compresses to ~30 messages mid-run
3. _compress_context() resets _last_flushed_db_idx = 0
4. On exit, _flush_messages_to_session_db() calculates:
   flush_from = max(len(conversation_history)=200, _last_flushed_db_idx=0) = 200
5. messages[200:] is empty (only ~30 messages after compression)
6. Nothing written to new session's SQLite — compressed context lost
7. Gateway's history_offset was still 200, producing empty new_messages
8. Fallback wrote only user/assistant pair — summary lost
"""

import os
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest


# ---------------------------------------------------------------------------
# Part 1: Agent-side — _flush_messages_to_session_db after compression
# ---------------------------------------------------------------------------

|
class TestFlushAfterCompression:
    """Verify that compressed messages are flushed to the new session's SQLite
    even when conversation_history (from the original session) is longer than
    the compressed messages list."""

    def _make_agent(self, session_db):
        """Build an AIAgent wired to *session_db*, with a fake API key in the env."""
        with patch.dict(os.environ, {"OPENROUTER_API_KEY": "test-key"}):
            from run_agent import AIAgent

            return AIAgent(
                model="test/model",
                quiet_mode=True,
                session_db=session_db,
                session_id="original-session",
                skip_context_files=True,
                skip_memory=True,
            )

    def test_flush_after_compression_with_long_history(self):
        """The actual bug: conversation_history longer than compressed messages.

        Before the fix, flush_from = max(len(conversation_history), 0) = 200,
        but messages only has ~30 entries, so messages[200:] is empty.
        After the fix, conversation_history is cleared to None after compression,
        so flush_from = max(0, 0) = 0, and ALL compressed messages are written.
        """
        from hermes_state import SessionDB

        with tempfile.TemporaryDirectory() as tmpdir:
            db = SessionDB(db_path=Path(tmpdir) / "test.db")
            agent = self._make_agent(db)

            # Simulate the original long history (200 messages), alternating roles.
            original_history = [
                {
                    "role": ("user", "assistant")[idx % 2],
                    "content": f"message {idx}",
                }
                for idx in range(200)
            ]

            # Flush the original messages into the original session first.
            agent._flush_messages_to_session_db(original_history, [])
            original_rows = db.get_messages("original-session")
            assert len(original_rows) == 200

            # Simulate compression: new session, reset flush index, short list.
            agent.session_id = "compressed-session"
            db.create_session(session_id="compressed-session", source="test")
            agent._last_flushed_db_idx = 0

            # The compressed messages (summary + tail + new turn).
            compressed_messages = [
                {"role": "user", "content": "[CONTEXT COMPACTION] Summary of work..."},
                {"role": "user", "content": "What should we do next?"},
                {"role": "assistant", "content": "Let me check..."},
                {"role": "user", "content": "new question"},
                {"role": "assistant", "content": "new answer"},
            ]

            # THE BUG: passing the original history as conversation_history
            # causes flush_from = max(200, 0) = 200, skipping everything.
            # After the fix, conversation_history should be None.
            agent._flush_messages_to_session_db(compressed_messages, None)

            new_rows = db.get_messages("compressed-session")
            assert len(new_rows) == 5, (
                f"Expected 5 compressed messages in new session, got {len(new_rows)}. "
                f"Compression persistence bug: messages not written to SQLite."
            )

    def test_flush_with_stale_history_loses_messages(self):
        """Demonstrates the bug condition: stale conversation_history causes data loss."""
        from hermes_state import SessionDB

        with tempfile.TemporaryDirectory() as tmpdir:
            db = SessionDB(db_path=Path(tmpdir) / "test.db")
            agent = self._make_agent(db)

            # Simulate the post-compression reset.
            agent.session_id = "new-session"
            db.create_session(session_id="new-session", source="test")
            agent._last_flushed_db_idx = 0

            compressed = [
                {"role": "user", "content": "summary"},
                {"role": "assistant", "content": "continuing..."},
            ]

            # Bug: pass a conversation_history longer than the compressed messages.
            stale_history = [{"role": "user", "content": f"msg{n}"} for n in range(100)]
            agent._flush_messages_to_session_db(compressed, stale_history)

            # With the stale history, flush_from = max(100, 0) = 100, but
            # compressed has only 2 entries, so messages[100:] is empty.
            rows = db.get_messages("new-session")
            assert len(rows) == 0, (
                "Expected 0 messages with stale conversation_history "
                "(this test verifies the bug condition exists)"
            )
|

# ---------------------------------------------------------------------------
# Part 2: Gateway-side — history_offset after session split
# ---------------------------------------------------------------------------

|
class TestGatewayHistoryOffsetAfterSplit:
    """Verify that when the agent creates a new session during compression,
    the gateway uses history_offset=0 so all compressed messages are written
    to the JSONL transcript."""

    def test_history_offset_zero_on_session_split(self):
        """When agent.session_id differs from the original, history_offset must be 0."""
        # Mirrors the logic in gateway/run.py run_sync():
        #   _session_was_split = agent.session_id != session_id
        #   _effective_history_offset = 0 if _session_was_split else len(agent_history)
        original_session_id = "session-abc"
        agent_session_id = "session-compressed-xyz"  # Different = compression happened
        agent_history_len = 200

        # Simulate the gateway's offset calculation (post-fix).
        was_split = agent_session_id != original_session_id
        effective_offset = 0 if was_split else agent_history_len

        assert was_split is True
        assert effective_offset == 0

    def test_history_offset_preserved_without_split(self):
        """When no compression happened, history_offset is the original length."""
        session_id = "session-abc"
        agent_session_id = "session-abc"  # Same = no compression
        agent_history_len = 200

        was_split = agent_session_id != session_id
        effective_offset = agent_history_len if not was_split else 0

        assert was_split is False
        assert effective_offset == 200

    def test_new_messages_extraction_after_split(self):
        """After compression with offset=0, new_messages should be ALL agent messages."""
        # Simulates the gateway's new_messages calculation.
        agent_messages = [
            {"role": "user", "content": "[CONTEXT COMPACTION] Summary..."},
            {"role": "user", "content": "recent question"},
            {"role": "assistant", "content": "recent answer"},
            {"role": "user", "content": "new question"},
            {"role": "assistant", "content": "new answer"},
        ]
        history_offset = 0  # After fix: 0 on session split

        if len(agent_messages) > history_offset:
            new_messages = agent_messages[history_offset:]
        else:
            new_messages = []

        assert len(new_messages) == 5, (
            f"Expected all 5 messages with offset=0, got {len(new_messages)}"
        )

    def test_new_messages_empty_with_stale_offset(self):
        """Demonstrates the bug: stale offset produces empty new_messages."""
        agent_messages = [
            {"role": "user", "content": "summary"},
            {"role": "assistant", "content": "answer"},
        ]
        history_offset = 200  # Bug: offset is the pre-compression history length

        if len(agent_messages) > history_offset:
            new_messages = agent_messages[history_offset:]
        else:
            new_messages = []

        assert len(new_messages) == 0, (
            "Expected 0 messages with stale offset=200 (demonstrates the bug)"
        )