feat(budget): make tool result persistence thresholds configurable

Add BudgetConfig dataclass to centralize and make overridable the
hardcoded constants (50K per-result, 200K per-turn, 2K preview) that
control when tool outputs get persisted to sandbox. Configurable at
the RL environment level via HermesAgentEnvConfig fields, threaded
through HermesAgentLoop to the storage layer.

Per-tool threshold resolution order (highest priority first): pinned
(read_file=inf) > env config overrides > registry per-tool > default.
Example CLI override: --env.turn_budget_chars 80000
This commit is contained in:
alt-glitch
2026-04-07 22:31:06 -07:00
committed by Teknium
parent 65e24c942e
commit 77c5bc9da9
9 changed files with 131 additions and 9 deletions

View File

@@ -140,6 +140,7 @@ class HermesAgentLoop:
temperature: float = 1.0,
max_tokens: Optional[int] = None,
extra_body: Optional[Dict[str, Any]] = None,
budget_config: Optional["BudgetConfig"] = None,
):
"""
Initialize the agent loop.
@@ -156,7 +157,11 @@ class HermesAgentLoop:
extra_body: Extra parameters passed to the OpenAI client's create() call.
Used for OpenRouter provider preferences, transforms, etc.
e.g. {"provider": {"ignore": ["DeepInfra"]}}
budget_config: Tool result persistence budget. Controls per-tool
thresholds, per-turn aggregate budget, and preview size.
If None, uses DEFAULT_BUDGET (current hardcoded values).
"""
from tools.budget_config import DEFAULT_BUDGET
self.server = server
self.tool_schemas = tool_schemas
self.valid_tool_names = valid_tool_names
@@ -165,6 +170,7 @@ class HermesAgentLoop:
self.temperature = temperature
self.max_tokens = max_tokens
self.extra_body = extra_body
self.budget_config = budget_config or DEFAULT_BUDGET
async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
"""
@@ -455,6 +461,8 @@ class HermesAgentLoop:
tool_name=tool_name,
tool_use_id=tc_id,
env=get_active_env(self.task_id),
threshold=self.budget_config.resolve_threshold(tool_name),
preview_size=self.budget_config.preview_size,
)
except Exception:
pass # Persistence is best-effort in eval path
@@ -470,7 +478,12 @@ class HermesAgentLoop:
try:
num_tcs = len(assistant_msg.tool_calls)
if num_tcs > 0:
enforce_turn_budget(messages[-num_tcs:], env=get_active_env(self.task_id))
enforce_turn_budget(
messages[-num_tcs:],
env=get_active_env(self.task_id),
budget=self.budget_config.turn_budget,
preview_size=self.budget_config.preview_size,
)
except Exception:
pass

View File

@@ -1048,6 +1048,7 @@ class AgenticOPDEnv(HermesAgentBaseEnv):
temperature=0.0,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)

View File

@@ -541,6 +541,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
else:
@@ -553,6 +554,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)

View File

@@ -549,6 +549,7 @@ class YCBenchEvalEnv(HermesAgentBaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)

View File

@@ -62,6 +62,11 @@ from atroposlib.type_definitions import Item
from environments.agent_loop import AgentResult, HermesAgentLoop
from environments.tool_context import ToolContext
from tools.budget_config import (
DEFAULT_RESULT_SIZE_CHARS,
DEFAULT_TURN_BUDGET_CHARS,
DEFAULT_PREVIEW_SIZE_CHARS,
)
# Import hermes-agent toolset infrastructure
from model_tools import get_tool_definitions
@@ -160,6 +165,32 @@ class HermesAgentEnvConfig(BaseEnvConfig):
"Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.",
)
# --- Tool result budget ---
# Defaults imported from tools.budget_config (single source of truth).
default_result_size_chars: int = Field(
default=DEFAULT_RESULT_SIZE_CHARS,
description="Default per-tool threshold (chars) for persisting large results "
"to sandbox. Results exceeding this are written to /tmp/hermes-results/ "
"and replaced with a preview. Per-tool registry values take precedence "
"unless overridden via tool_result_overrides.",
)
turn_budget_chars: int = Field(
default=DEFAULT_TURN_BUDGET_CHARS,
description="Aggregate char budget per assistant turn. If all tool results "
"in a single turn exceed this, the largest are persisted to disk first.",
)
preview_size_chars: int = Field(
default=DEFAULT_PREVIEW_SIZE_CHARS,
description="Size of the inline preview shown after a tool result is persisted.",
)
tool_result_overrides: Optional[Dict[str, int]] = Field(
default=None,
description="Per-tool threshold overrides (chars). Keys are tool names, "
"values are char thresholds. Overrides both the default and registry "
"per-tool values. Example: {'terminal': 10000, 'search_files': 5000}. "
"Note: read_file is pinned to infinity and cannot be overridden.",
)
# --- Provider-specific parameters ---
# Passed as extra_body to the OpenAI client's chat.completions.create() call.
# Useful for OpenRouter provider preferences, transforms, route settings, etc.
@@ -176,6 +207,16 @@ class HermesAgentEnvConfig(BaseEnvConfig):
"transforms, and other provider-specific settings.",
)
def build_budget_config(self):
    """Translate this env config's budget fields into a BudgetConfig.

    Maps default_result_size_chars, turn_budget_chars and preview_size_chars
    onto the corresponding BudgetConfig fields. tool_result_overrides is
    copied into a fresh dict; a missing or empty mapping becomes ``{}``.
    """
    # Imported lazily, matching the original function-scope import.
    from tools.budget_config import BudgetConfig

    overrides = self.tool_result_overrides
    per_tool = dict(overrides) if overrides else {}
    return BudgetConfig(
        default_result_size=self.default_result_size_chars,
        turn_budget=self.turn_budget_chars,
        preview_size=self.preview_size_chars,
        tool_overrides=per_tool,
    )
class HermesAgentBaseEnv(BaseEnv):
"""
@@ -490,6 +531,7 @@ class HermesAgentBaseEnv(BaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
except NotImplementedError:
@@ -507,6 +549,7 @@ class HermesAgentBaseEnv(BaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
else:
@@ -520,6 +563,7 @@ class HermesAgentBaseEnv(BaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)

View File

@@ -472,6 +472,7 @@ class WebResearchEnv(HermesAgentBaseEnv):
temperature=0.0, # Deterministic for eval
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)

52
tools/budget_config.py Normal file
View File

@@ -0,0 +1,52 @@
"""Configurable budget constants for tool result persistence.
Overridable at the RL environment level via HermesAgentEnvConfig fields.
Per-tool resolution: pinned > config overrides > registry > default.
"""
from dataclasses import dataclass, field
from typing import Dict
# Tools whose thresholds must never be overridden.
# read_file=inf prevents infinite persist->read->persist loops.
PINNED_THRESHOLDS: Dict[str, float] = {
"read_file": float("inf"),
}
# Defaults matching the current hardcoded values in tool_result_storage.py.
# Kept here as the single source of truth; tool_result_storage.py imports these.
DEFAULT_RESULT_SIZE_CHARS: int = 50_000
DEFAULT_TURN_BUDGET_CHARS: int = 200_000
DEFAULT_PREVIEW_SIZE_CHARS: int = 2_000
@dataclass(frozen=True)
class BudgetConfig:
"""Immutable budget constants for the 3-layer tool result persistence system.
Layer 2 (per-result): resolve_threshold(tool_name) -> threshold in chars.
Layer 3 (per-turn): turn_budget -> aggregate char budget across all tool
results in a single assistant turn.
Preview: preview_size -> inline snippet size after persistence.
"""
default_result_size: int = DEFAULT_RESULT_SIZE_CHARS
turn_budget: int = DEFAULT_TURN_BUDGET_CHARS
preview_size: int = DEFAULT_PREVIEW_SIZE_CHARS
tool_overrides: Dict[str, int] = field(default_factory=dict)
def resolve_threshold(self, tool_name: str) -> int | float:
"""Resolve the persistence threshold for a tool.
Priority: pinned -> tool_overrides -> registry per-tool -> default.
"""
if tool_name in PINNED_THRESHOLDS:
return PINNED_THRESHOLDS[tool_name]
if tool_name in self.tool_overrides:
return self.tool_overrides[tool_name]
from tools.registry import registry
return registry.get_max_result_size(tool_name, default=self.default_result_size)
# Default config -- matches current hardcoded behavior exactly.
DEFAULT_BUDGET = BudgetConfig()

View File

@@ -169,12 +169,14 @@ class ToolRegistry:
# Query helpers (replace redundant dicts in model_tools.py)
# ------------------------------------------------------------------
def get_max_result_size(self, name: str) -> int | float:
"""Return per-tool max result size, or global default."""
from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS
def get_max_result_size(self, name: str, default: int | float | None = None) -> int | float:
"""Return per-tool max result size, or *default* (or global default)."""
entry = self._tools.get(name)
if entry and entry.max_result_size_chars is not None:
return entry.max_result_size_chars
if default is not None:
return default
from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS
return DEFAULT_MAX_RESULT_SIZE_CHARS
def get_all_tool_names(self) -> List[str]:

View File

@@ -24,11 +24,13 @@ import json
import logging
import uuid
logger = logging.getLogger(__name__)
from tools.budget_config import (
DEFAULT_RESULT_SIZE_CHARS as DEFAULT_MAX_RESULT_SIZE_CHARS,
DEFAULT_TURN_BUDGET_CHARS as MAX_TURN_BUDGET_CHARS,
DEFAULT_PREVIEW_SIZE_CHARS as PREVIEW_SIZE_CHARS,
)
DEFAULT_MAX_RESULT_SIZE_CHARS: int = 50_000
MAX_TURN_BUDGET_CHARS: int = 200_000
PREVIEW_SIZE_CHARS: int = 2_000
logger = logging.getLogger(__name__)
PERSISTED_OUTPUT_TAG = "<persisted-output>"
PERSISTED_OUTPUT_CLOSING_TAG = "</persisted-output>"
STORAGE_DIR = "/tmp/hermes-results"
@@ -112,6 +114,7 @@ def maybe_persist_tool_result(
tool_use_id: str,
env=None,
threshold: int | float | None = None,
preview_size: int = PREVIEW_SIZE_CHARS,
) -> str:
"""Layer 2: persist oversized result into the sandbox, return preview + path.
@@ -125,6 +128,7 @@ def maybe_persist_tool_result(
tool_use_id: Unique ID for this tool call (used as filename).
env: The active BaseEnvironment instance, or None.
threshold: Override threshold; if None, looked up from registry.
preview_size: Max chars for the inline preview after persistence.
Returns:
Original content if small, or <persisted-output> replacement.
@@ -143,7 +147,7 @@ def maybe_persist_tool_result(
remote_path = f"{STORAGE_DIR}/{tool_use_id}.txt"
# Write raw output (not JSON wrapper) so read_file returns readable text
file_content = _extract_raw_output(content)
preview, has_more = generate_preview(file_content)
preview, has_more = generate_preview(file_content, max_chars=preview_size)
# Try writing into the sandbox
if env is not None:
@@ -173,6 +177,7 @@ def enforce_turn_budget(
tool_messages: list[dict],
env=None,
budget: int = MAX_TURN_BUDGET_CHARS,
preview_size: int = PREVIEW_SIZE_CHARS,
) -> list[dict]:
"""Layer 3: enforce aggregate budget across all tool results in a turn.
@@ -210,6 +215,7 @@ def enforce_turn_budget(
tool_use_id=tool_use_id,
env=env,
threshold=0,
preview_size=preview_size,
)
if replacement != content:
total_size -= size