Files
hermes-agent/tests/tools/test_tool_result_storage.py
Teknium 3696c74bfb fix: preserve existing thresholds, remove pre-read byte guard
- DEFAULT_RESULT_SIZE_CHARS: 50K -> 100K (match current _LARGE_RESULT_CHARS)
- DEFAULT_PREVIEW_SIZE_CHARS: 2K -> 1.5K (match current _LARGE_RESULT_PREVIEW_CHARS)
- Per-tool overrides all set to 100K (terminal, execute_code, search_files)
- Remove pre-read byte guard (no behavioral regression vs current main)
- Revert limit signature change to int=500 (match current default)
- Restore original read_file schema description
- Update test assertions to match 100K thresholds
2026-04-08 02:24:32 -07:00

473 lines
18 KiB
Python

"""Tests for tools/tool_result_storage.py -- 3-layer tool result persistence."""
import pytest
from unittest.mock import MagicMock, patch
from tools.budget_config import (
DEFAULT_RESULT_SIZE_CHARS,
DEFAULT_TURN_BUDGET_CHARS,
DEFAULT_PREVIEW_SIZE_CHARS,
BudgetConfig,
)
from tools.tool_result_storage import (
HEREDOC_MARKER,
PERSISTED_OUTPUT_TAG,
PERSISTED_OUTPUT_CLOSING_TAG,
STORAGE_DIR,
_build_persisted_message,
_heredoc_marker,
_write_to_sandbox,
enforce_turn_budget,
generate_preview,
maybe_persist_tool_result,
)
# ── generate_preview ──────────────────────────────────────────────────
class TestGeneratePreview:
def test_short_content_unchanged(self):
text = "short result"
preview, has_more = generate_preview(text)
assert preview == text
assert has_more is False
def test_long_content_truncated(self):
text = "x" * 5000
preview, has_more = generate_preview(text, max_chars=2000)
assert len(preview) <= 2000
assert has_more is True
def test_truncates_at_newline_boundary(self):
# 1500 chars + newline + 600 chars (past halfway)
text = "a" * 1500 + "\n" + "b" * 600
preview, has_more = generate_preview(text, max_chars=2000)
assert preview == "a" * 1500 + "\n"
assert has_more is True
def test_ignores_early_newline(self):
# Newline at position 100, well before halfway of 2000
text = "a" * 100 + "\n" + "b" * 3000
preview, has_more = generate_preview(text, max_chars=2000)
assert len(preview) == 2000
assert has_more is True
def test_empty_content(self):
preview, has_more = generate_preview("")
assert preview == ""
assert has_more is False
def test_exact_boundary(self):
text = "x" * DEFAULT_PREVIEW_SIZE_CHARS
preview, has_more = generate_preview(text)
assert preview == text
assert has_more is False
# ── _heredoc_marker ───────────────────────────────────────────────────
class TestHeredocMarker:
def test_default_marker_when_no_collision(self):
assert _heredoc_marker("normal content") == HEREDOC_MARKER
def test_uuid_marker_on_collision(self):
content = f"some text with {HEREDOC_MARKER} embedded"
marker = _heredoc_marker(content)
assert marker != HEREDOC_MARKER
assert marker.startswith("HERMES_PERSIST_")
assert marker not in content
# ── _write_to_sandbox ─────────────────────────────────────────────────
class TestWriteToSandbox:
def test_success(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
result = _write_to_sandbox("hello world", "/tmp/hermes-results/abc.txt", env)
assert result is True
env.execute.assert_called_once()
cmd = env.execute.call_args[0][0]
assert "mkdir -p" in cmd
assert "hello world" in cmd
assert HEREDOC_MARKER in cmd
def test_failure_returns_false(self):
env = MagicMock()
env.execute.return_value = {"output": "error", "returncode": 1}
result = _write_to_sandbox("content", "/tmp/hermes-results/abc.txt", env)
assert result is False
def test_heredoc_collision_uses_uuid_marker(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
content = f"text with {HEREDOC_MARKER} inside"
_write_to_sandbox(content, "/tmp/hermes-results/abc.txt", env)
cmd = env.execute.call_args[0][0]
# The default marker should NOT be used as the delimiter
lines = cmd.split("\n")
# The first and last lines contain the actual delimiter
assert HEREDOC_MARKER not in lines[0].split("<<")[1]
def test_timeout_passed(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
_write_to_sandbox("content", "/tmp/hermes-results/abc.txt", env)
assert env.execute.call_args[1]["timeout"] == 30
# ── _build_persisted_message ──────────────────────────────────────────
class TestBuildPersistedMessage:
def test_structure(self):
msg = _build_persisted_message(
preview="first 100 chars...",
has_more=True,
original_size=50_000,
file_path="/tmp/hermes-results/test123.txt",
)
assert msg.startswith(PERSISTED_OUTPUT_TAG)
assert msg.endswith(PERSISTED_OUTPUT_CLOSING_TAG)
assert "50,000 characters" in msg
assert "/tmp/hermes-results/test123.txt" in msg
assert "read_file" in msg
assert "first 100 chars..." in msg
assert "..." in msg # has_more indicator
def test_no_ellipsis_when_complete(self):
msg = _build_persisted_message(
preview="complete content",
has_more=False,
original_size=16,
file_path="/tmp/hermes-results/x.txt",
)
# Should not have the trailing "..." indicator before closing tag
lines = msg.strip().split("\n")
assert lines[-2] != "..."
def test_large_size_shows_mb(self):
msg = _build_persisted_message(
preview="x",
has_more=True,
original_size=2_000_000,
file_path="/tmp/hermes-results/big.txt",
)
assert "MB" in msg
# ── maybe_persist_tool_result ─────────────────────────────────────────
class TestMaybePersistToolResult:
def test_below_threshold_returns_unchanged(self):
content = "small result"
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_123",
env=None,
threshold=50_000,
)
assert result == content
def test_above_threshold_with_env_persists(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
content = "x" * 60_000
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_456",
env=env,
threshold=30_000,
)
assert PERSISTED_OUTPUT_TAG in result
assert "tc_456.txt" in result
assert len(result) < len(content)
env.execute.assert_called_once()
def test_persists_full_content_as_is(self):
"""Content is persisted verbatim — no JSON extraction."""
import json
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
raw = "line1\nline2\n" * 5_000
content = json.dumps({"output": raw, "exit_code": 0, "error": None})
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_json",
env=env,
threshold=30_000,
)
assert PERSISTED_OUTPUT_TAG in result
# The heredoc written to sandbox should contain the full JSON blob
cmd = env.execute.call_args[0][0]
assert '"exit_code"' in cmd
def test_above_threshold_no_env_truncates_inline(self):
content = "x" * 60_000
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_789",
env=None,
threshold=30_000,
)
assert PERSISTED_OUTPUT_TAG not in result
assert "Truncated" in result
assert len(result) < len(content)
def test_env_write_failure_falls_back_to_truncation(self):
env = MagicMock()
env.execute.return_value = {"output": "disk full", "returncode": 1}
content = "x" * 60_000
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_fail",
env=env,
threshold=30_000,
)
assert PERSISTED_OUTPUT_TAG not in result
assert "Truncated" in result
def test_env_execute_exception_falls_back(self):
env = MagicMock()
env.execute.side_effect = RuntimeError("connection lost")
content = "x" * 60_000
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_exc",
env=env,
threshold=30_000,
)
assert "Truncated" in result
def test_read_file_never_persisted(self):
"""read_file has threshold=inf, should never be persisted."""
env = MagicMock()
content = "x" * 200_000
result = maybe_persist_tool_result(
content=content,
tool_name="read_file",
tool_use_id="tc_rf",
env=env,
threshold=float("inf"),
)
assert result == content
env.execute.assert_not_called()
def test_uses_registry_threshold_when_not_provided(self):
"""When threshold=None, looks up from registry."""
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
content = "x" * 60_000
mock_registry = MagicMock()
mock_registry.get_max_result_size.return_value = 30_000
with patch("tools.registry.registry", mock_registry):
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_reg",
env=env,
threshold=None,
)
# Should have persisted since 60K > 30K
assert PERSISTED_OUTPUT_TAG in result or "Truncated" in result
def test_unicode_content_survives(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
content = "日本語テスト " * 10_000 # ~60K chars of unicode
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_uni",
env=env,
threshold=30_000,
)
assert PERSISTED_OUTPUT_TAG in result
# Preview should contain unicode
assert "日本語テスト" in result
def test_empty_content_returns_unchanged(self):
result = maybe_persist_tool_result(
content="",
tool_name="terminal",
tool_use_id="tc_empty",
env=None,
threshold=30_000,
)
assert result == ""
def test_whitespace_only_below_threshold(self):
content = " " * 100
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_ws",
env=None,
threshold=30_000,
)
assert result == content
def test_file_path_uses_tool_use_id(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
content = "x" * 60_000
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="unique_id_abc",
env=env,
threshold=30_000,
)
assert "unique_id_abc.txt" in result
def test_preview_included_in_persisted_output(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
# Create content with a distinctive start
content = "DISTINCTIVE_START_MARKER" + "x" * 60_000
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_prev",
env=env,
threshold=30_000,
)
assert "DISTINCTIVE_START_MARKER" in result
def test_threshold_zero_forces_persist(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
content = "even short content"
result = maybe_persist_tool_result(
content=content,
tool_name="terminal",
tool_use_id="tc_zero",
env=env,
threshold=0,
)
# Any non-empty content with threshold=0 should be persisted
assert PERSISTED_OUTPUT_TAG in result
# ── enforce_turn_budget ───────────────────────────────────────────────
class TestEnforceTurnBudget:
def test_under_budget_no_changes(self):
msgs = [
{"role": "tool", "tool_call_id": "t1", "content": "small"},
{"role": "tool", "tool_call_id": "t2", "content": "also small"},
]
result = enforce_turn_budget(msgs, env=None, config=BudgetConfig(turn_budget=200_000))
assert result[0]["content"] == "small"
assert result[1]["content"] == "also small"
def test_over_budget_largest_persisted_first(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
msgs = [
{"role": "tool", "tool_call_id": "t1", "content": "a" * 80_000},
{"role": "tool", "tool_call_id": "t2", "content": "b" * 130_000},
]
# Total 210K > 200K budget
enforce_turn_budget(msgs, env=env, config=BudgetConfig(turn_budget=200_000))
# The larger one (130K) should be persisted first
assert PERSISTED_OUTPUT_TAG in msgs[1]["content"]
def test_already_persisted_results_skipped(self):
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
msgs = [
{"role": "tool", "tool_call_id": "t1",
"content": f"{PERSISTED_OUTPUT_TAG}\nalready persisted\n{PERSISTED_OUTPUT_CLOSING_TAG}"},
{"role": "tool", "tool_call_id": "t2", "content": "x" * 250_000},
]
enforce_turn_budget(msgs, env=env, config=BudgetConfig(turn_budget=200_000))
# t1 should be untouched (already persisted)
assert msgs[0]["content"].startswith(PERSISTED_OUTPUT_TAG)
# t2 should be persisted
assert PERSISTED_OUTPUT_TAG in msgs[1]["content"]
def test_medium_result_regression(self):
"""6 results of 42K chars each (252K total) — each under 100K default
threshold but aggregate exceeds 200K budget. L3 should persist."""
env = MagicMock()
env.execute.return_value = {"output": "", "returncode": 0}
msgs = [
{"role": "tool", "tool_call_id": f"t{i}", "content": "x" * 42_000}
for i in range(6)
]
enforce_turn_budget(msgs, env=env, config=BudgetConfig(turn_budget=200_000))
# At least some results should be persisted to get under 200K
persisted_count = sum(
1 for m in msgs if PERSISTED_OUTPUT_TAG in m["content"]
)
assert persisted_count >= 2 # Need to shed at least ~52K
def test_no_env_falls_back_to_truncation(self):
msgs = [
{"role": "tool", "tool_call_id": "t1", "content": "x" * 250_000},
]
enforce_turn_budget(msgs, env=None, config=BudgetConfig(turn_budget=200_000))
# Should be truncated (no sandbox available)
assert "Truncated" in msgs[0]["content"] or PERSISTED_OUTPUT_TAG in msgs[0]["content"]
def test_returns_same_list(self):
msgs = [{"role": "tool", "tool_call_id": "t1", "content": "ok"}]
result = enforce_turn_budget(msgs, env=None, config=BudgetConfig(turn_budget=200_000))
assert result is msgs
def test_empty_messages(self):
result = enforce_turn_budget([], env=None, config=BudgetConfig(turn_budget=200_000))
assert result == []
# ── Per-tool threshold integration ────────────────────────────────────
class TestPerToolThresholds:
"""Verify registry wiring for per-tool thresholds."""
def test_registry_has_get_max_result_size(self):
from tools.registry import registry
assert hasattr(registry, "get_max_result_size")
def test_default_threshold(self):
from tools.registry import registry
# Unknown tool should return the default
val = registry.get_max_result_size("nonexistent_tool_xyz")
assert val == DEFAULT_RESULT_SIZE_CHARS
def test_terminal_threshold(self):
from tools.registry import registry
# Trigger import of terminal_tool to register the tool
try:
import tools.terminal_tool # noqa: F401
val = registry.get_max_result_size("terminal")
assert val == 100_000
except ImportError:
pytest.skip("terminal_tool not importable in test env")
def test_read_file_never_persisted(self):
from tools.registry import registry
try:
import tools.file_tools # noqa: F401
val = registry.get_max_result_size("read_file")
assert val == float("inf")
except ImportError:
pytest.skip("file_tools not importable in test env")
def test_search_files_threshold(self):
from tools.registry import registry
try:
import tools.file_tools # noqa: F401
val = registry.get_max_result_size("search_files")
assert val == 100_000
except ImportError:
pytest.skip("file_tools not importable in test env")