- Remove _extract_raw_output: persist content verbatim (fixes size mismatch bug) - Drop import aliases: import from budget_config directly, one canonical name - BudgetConfig param on maybe_persist_tool_result and enforce_turn_budget - read_file: limit=None signature, pre-read guard fires only when limit omitted (256KB) - Unify binary extensions: file_operations.py imports from binary_extensions.py - Exclude .pdf and .svg from binary set (text-based, agents may inspect) - Remove redundant outer try/except in eval path (internal fallback handles it) - Fix broken tests: update assertion strings for new persistence format - Module-level constants: _PRE_READ_MAX_BYTES, _DEFAULT_READ_LIMIT - Remove redundant pathlib import (Path already at module level) - Update spec.md with IMPLEMENTED annotations and design decisions
205 lines
7.0 KiB
Python
205 lines
7.0 KiB
Python
"""Tool result persistence -- preserves large outputs instead of truncating.
|
|
|
|
Defense against context-window overflow operates at three levels:
|
|
|
|
1. **Per-tool output cap** (inside each tool): Tools like search_files
|
|
pre-truncate their own output before returning. This is the first line
|
|
of defense and the only one the tool author controls.
|
|
|
|
2. **Per-result persistence** (maybe_persist_tool_result): After a tool
|
|
returns, if its output exceeds the tool's registered threshold
|
|
(registry.get_max_result_size), the full output is written INTO THE
|
|
SANDBOX at /tmp/hermes-results/{tool_use_id}.txt via env.execute().
|
|
The in-context content is replaced with a preview + file path reference.
|
|
The model can read_file to access the full output on any backend.
|
|
|
|
3. **Per-turn aggregate budget** (enforce_turn_budget): After all tool
|
|
results in a single assistant turn are collected, if the total exceeds
|
|
MAX_TURN_BUDGET_CHARS (200K), the largest non-persisted results are
|
|
spilled to disk until the aggregate is under budget. This catches cases
|
|
where many medium-sized results combine to overflow context.
|
|
"""
|
|
|
|
import logging
|
|
import uuid
|
|
|
|
from tools.budget_config import (
|
|
DEFAULT_PREVIEW_SIZE_CHARS,
|
|
BudgetConfig,
|
|
DEFAULT_BUDGET,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)

# Opening/closing markers wrapped around the replacement text that stands in
# for a persisted tool result. enforce_turn_budget looks for the opening tag
# to recognise results that were already persisted.
PERSISTED_OUTPUT_TAG = "<persisted-output>"
PERSISTED_OUTPUT_CLOSING_TAG = "</persisted-output>"

# Directory (inside the sandbox) that receives the full tool outputs.
STORAGE_DIR = "/tmp/hermes-results"

# Default heredoc delimiter used when writing content through the shell;
# _heredoc_marker swaps it for a random one if the content contains it.
HEREDOC_MARKER = "HERMES_PERSIST_EOF"

# Tool name passed to maybe_persist_tool_result by enforce_turn_budget.
_BUDGET_TOOL_NAME = "__budget_enforcement__"
|
|
|
|
|
|
def generate_preview(content: str, max_chars: int = DEFAULT_PREVIEW_SIZE_CHARS) -> tuple[str, bool]:
    """Return the leading part of *content* to show as a preview.

    Content that already fits within ``max_chars`` is returned unchanged.
    Otherwise the first ``max_chars`` characters are kept, and the cut is
    moved back to just after the last newline in that window when that
    newline lies past the window's midpoint, so the preview ends on a
    complete line without giving up more than half of the window.

    Returns:
        ``(preview, has_more)`` where ``has_more`` is True when content was
        cut off.
    """
    if len(content) <= max_chars:
        return content, False

    # Position of the last newline inside the first max_chars characters.
    newline_at = content.rfind("\n", 0, max_chars)
    # Only snap to the newline when it keeps more than half of the window.
    end = newline_at + 1 if newline_at > max_chars // 2 else max_chars
    return content[:end], True
|
|
|
|
|
|
def _heredoc_marker(content: str) -> str:
    """Pick a heredoc delimiter for writing *content* through the shell.

    The fixed HEREDOC_MARKER is used unless it occurs somewhere in the
    content, in which case a delimiter with a random 8-hex-digit suffix is
    generated instead.
    """
    if HEREDOC_MARKER in content:
        suffix = uuid.uuid4().hex[:8]
        return f"HERMES_PERSIST_{suffix}"
    return HEREDOC_MARKER
|
|
|
|
|
|
def _write_to_sandbox(content: str, remote_path: str, env) -> bool:
    """Write *content* to *remote_path* inside the sandbox via ``env.execute()``.

    The file is produced by a shell command of the form
    ``mkdir -p STORAGE_DIR && cat > remote_path << 'MARKER'`` followed by the
    content and the closing marker. The heredoc delimiter is quoted, so the
    shell does not expand anything inside the content; the heredoc always
    terminates the written file with a newline.

    Args:
        content: Text to store.
        remote_path: Destination path in the sandbox.
        env: Object exposing ``execute(cmd, timeout=...)`` whose result
            supports ``.get("returncode", ...)``.

    Returns:
        True when the command reports return code 0, False otherwise
        (a result without a ``returncode`` entry counts as failure).
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    marker = _heredoc_marker(content)
    # remote_path is built from a tool-call id; quote it (and the directory)
    # so whitespace or shell metacharacters cannot break or inject into the
    # command. Paths made only of safe characters are left unchanged.
    quoted_dir = shlex.quote(STORAGE_DIR)
    quoted_path = shlex.quote(remote_path)
    cmd = (
        f"mkdir -p {quoted_dir} && cat > {quoted_path} << '{marker}'\n"
        f"{content}\n"
        f"{marker}"
    )
    result = env.execute(cmd, timeout=30)
    return result.get("returncode", 1) == 0
|
|
|
|
|
|
def _build_persisted_message(
    preview: str,
    has_more: bool,
    original_size: int,
    file_path: str,
) -> str:
    """Assemble the ``<persisted-output>`` block that replaces a large result.

    Args:
        preview: Leading part of the original output to show inline.
        has_more: Whether the preview was cut short; adds a ``...`` line.
        original_size: Length of the full output, reported both as a raw
            count and as a KB/MB figure (``original_size / 1024``).
        file_path: Where the full output was saved.

    Returns:
        The replacement text, wrapped in the opening and closing tags.
    """
    kib = original_size / 1024
    size_str = f"{kib / 1024:.1f} MB" if kib >= 1024 else f"{kib:.1f} KB"

    parts = [
        f"{PERSISTED_OUTPUT_TAG}\n",
        f"This tool result was too large ({original_size:,} characters, {size_str}).\n",
        f"Full output saved to: {file_path}\n",
        "Use the read_file tool with offset and limit to access specific sections of this output.\n\n",
        f"Preview (first {len(preview)} chars):\n",
        preview,
    ]
    if has_more:
        parts.append("\n...")
    parts.append(f"\n{PERSISTED_OUTPUT_CLOSING_TAG}")
    return "".join(parts)
|
|
|
|
|
|
def maybe_persist_tool_result(
    content: str,
    tool_name: str,
    tool_use_id: str,
    env=None,
    config: BudgetConfig = DEFAULT_BUDGET,
    threshold: int | float | None = None,
) -> str:
    """Layer 2: swap an oversized tool result for a preview plus file reference.

    When the result is longer than the effective threshold, the full text is
    written to ``STORAGE_DIR/{tool_use_id}.txt`` through ``_write_to_sandbox``
    and a ``<persisted-output>`` block is returned in its place. If no env is
    given, the write reports failure, or anything in that path raises, the
    result is truncated inline instead.

    Args:
        content: Raw tool result string.
        tool_name: Name of the tool (used for threshold lookup and logging).
        tool_use_id: Unique ID for this tool call (used as the file name).
        env: Environment object handed to ``_write_to_sandbox``, or None.
        config: BudgetConfig supplying the threshold and preview size.
        threshold: Explicit override; takes precedence over
            ``config.resolve_threshold(tool_name)``.

    Returns:
        The original content when it is within the threshold; otherwise the
        ``<persisted-output>`` replacement, or the preview followed by a
        truncation notice when the output could not be saved.
    """
    if threshold is None:
        limit = config.resolve_threshold(tool_name)
    else:
        limit = threshold

    # An infinite limit opts the tool out before the content is measured.
    if limit == float("inf") or len(content) <= limit:
        return content

    total_chars = len(content)
    sandbox_file = f"{STORAGE_DIR}/{tool_use_id}.txt"
    snippet, clipped = generate_preview(content, max_chars=config.preview_size)

    if env is not None:
        # Best effort: any failure here falls through to inline truncation.
        try:
            stored = _write_to_sandbox(content, sandbox_file, env)
            if stored:
                logger.info(
                    "Persisted large tool result: %s (%s, %d chars -> %s)",
                    tool_name, tool_use_id, total_chars, sandbox_file,
                )
                return _build_persisted_message(snippet, clipped, total_chars, sandbox_file)
        except Exception as exc:
            logger.warning("Sandbox write failed for %s: %s", tool_use_id, exc)

    logger.info(
        "Inline-truncating large tool result: %s (%d chars, no sandbox write)",
        tool_name, total_chars,
    )
    notice = (
        f"[Truncated: tool response was {total_chars:,} chars. "
        "Full output could not be saved to sandbox.]"
    )
    return f"{snippet}\n\n{notice}"
|
|
|
|
|
|
def enforce_turn_budget(
    tool_messages: list[dict],
    env=None,
    config: BudgetConfig = DEFAULT_BUDGET,
) -> list[dict]:
    """Layer 3: enforce the aggregate budget across all tool results in a turn.

    Sums the length of every string ``content`` in *tool_messages*. If the
    total exceeds ``config.turn_budget``, results are handed to
    ``maybe_persist_tool_result`` (with ``threshold=0``) from largest to
    smallest until the total is within budget. Results that already contain
    the ``<persisted-output>`` tag are counted but never re-persisted.

    Messages whose ``content`` is missing or not a string are left untouched
    and do not count toward the total. A replacement is only applied when it
    is actually shorter than the original, so enforcement can never make the
    turn larger.

    Args:
        tool_messages: Tool-result message dicts; ``content`` holds the
            result text and ``tool_call_id`` (when present) names the file.
        env: Environment object forwarded to ``maybe_persist_tool_result``.
        config: BudgetConfig supplying ``turn_budget`` and preview settings.

    Returns:
        The same list object, mutated in place.
    """
    candidates = []
    total_size = 0
    for i, msg in enumerate(tool_messages):
        content = msg.get("content")
        if not isinstance(content, str):
            # Missing, None or structured content: nothing to measure or
            # persist, and len()/`in` would raise on some of these.
            continue
        size = len(content)
        total_size += size
        # Empty results cannot shrink; already-persisted ones are skipped.
        if size and PERSISTED_OUTPUT_TAG not in content:
            candidates.append((i, size))

    if total_size <= config.turn_budget:
        return tool_messages

    # Largest first, so the budget is met with as few replacements as possible.
    candidates.sort(key=lambda x: x[1], reverse=True)

    for idx, size in candidates:
        if total_size <= config.turn_budget:
            break
        msg = tool_messages[idx]
        content = msg["content"]
        # `or` also covers a tool_call_id that is present but None/empty.
        tool_use_id = msg.get("tool_call_id") or f"budget_{idx}"

        replacement = maybe_persist_tool_result(
            content=content,
            tool_name=_BUDGET_TOOL_NAME,
            tool_use_id=tool_use_id,
            env=env,
            config=config,
            threshold=0,
        )
        if len(replacement) >= size:
            # Preview plus wrapper is no shorter than the original (the
            # result fits inside the preview); keep the original text.
            continue

        total_size += len(replacement) - size
        msg["content"] = replacement
        logger.info(
            "Budget enforcement: persisted tool result %s (%d chars)",
            tool_use_id, size,
        )

    return tool_messages
|