feat(budget): make tool result persistence thresholds configurable
Add a BudgetConfig dataclass that centralizes the previously hardcoded constants (50K chars per result, 200K chars per turn, 2K chars of preview) controlling when tool outputs are persisted to the sandbox, and makes them overridable. The values are configurable at the RL environment level via HermesAgentEnvConfig fields and are threaded through HermesAgentLoop to the storage layer. Per-tool threshold resolution order: pinned (read_file=inf) > env config overrides > registry per-tool value > default. Example CLI override: --env.turn_budget_chars 80000.
This commit is contained in:
@@ -140,6 +140,7 @@ class HermesAgentLoop:
|
||||
temperature: float = 1.0,
|
||||
max_tokens: Optional[int] = None,
|
||||
extra_body: Optional[Dict[str, Any]] = None,
|
||||
budget_config: Optional["BudgetConfig"] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the agent loop.
|
||||
@@ -156,7 +157,11 @@ class HermesAgentLoop:
|
||||
extra_body: Extra parameters passed to the OpenAI client's create() call.
|
||||
Used for OpenRouter provider preferences, transforms, etc.
|
||||
e.g. {"provider": {"ignore": ["DeepInfra"]}}
|
||||
budget_config: Tool result persistence budget. Controls per-tool
|
||||
thresholds, per-turn aggregate budget, and preview size.
|
||||
If None, uses DEFAULT_BUDGET (current hardcoded values).
|
||||
"""
|
||||
from tools.budget_config import DEFAULT_BUDGET
|
||||
self.server = server
|
||||
self.tool_schemas = tool_schemas
|
||||
self.valid_tool_names = valid_tool_names
|
||||
@@ -165,6 +170,7 @@ class HermesAgentLoop:
|
||||
self.temperature = temperature
|
||||
self.max_tokens = max_tokens
|
||||
self.extra_body = extra_body
|
||||
self.budget_config = budget_config or DEFAULT_BUDGET
|
||||
|
||||
async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
|
||||
"""
|
||||
@@ -455,6 +461,8 @@ class HermesAgentLoop:
|
||||
tool_name=tool_name,
|
||||
tool_use_id=tc_id,
|
||||
env=get_active_env(self.task_id),
|
||||
threshold=self.budget_config.resolve_threshold(tool_name),
|
||||
preview_size=self.budget_config.preview_size,
|
||||
)
|
||||
except Exception:
|
||||
pass # Persistence is best-effort in eval path
|
||||
@@ -470,7 +478,12 @@ class HermesAgentLoop:
|
||||
try:
|
||||
num_tcs = len(assistant_msg.tool_calls)
|
||||
if num_tcs > 0:
|
||||
enforce_turn_budget(messages[-num_tcs:], env=get_active_env(self.task_id))
|
||||
enforce_turn_budget(
|
||||
messages[-num_tcs:],
|
||||
env=get_active_env(self.task_id),
|
||||
budget=self.budget_config.turn_budget,
|
||||
preview_size=self.budget_config.preview_size,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
@@ -1048,6 +1048,7 @@ class AgenticOPDEnv(HermesAgentBaseEnv):
|
||||
temperature=0.0,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
|
||||
@@ -541,6 +541,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
else:
|
||||
@@ -553,6 +554,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
|
||||
@@ -549,6 +549,7 @@ class YCBenchEvalEnv(HermesAgentBaseEnv):
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
|
||||
@@ -62,6 +62,11 @@ from atroposlib.type_definitions import Item
|
||||
|
||||
from environments.agent_loop import AgentResult, HermesAgentLoop
|
||||
from environments.tool_context import ToolContext
|
||||
from tools.budget_config import (
|
||||
DEFAULT_RESULT_SIZE_CHARS,
|
||||
DEFAULT_TURN_BUDGET_CHARS,
|
||||
DEFAULT_PREVIEW_SIZE_CHARS,
|
||||
)
|
||||
|
||||
# Import hermes-agent toolset infrastructure
|
||||
from model_tools import get_tool_definitions
|
||||
@@ -160,6 +165,32 @@ class HermesAgentEnvConfig(BaseEnvConfig):
|
||||
"Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.",
|
||||
)
|
||||
|
||||
# --- Tool result budget ---
|
||||
# Defaults imported from tools.budget_config (single source of truth).
|
||||
default_result_size_chars: int = Field(
|
||||
default=DEFAULT_RESULT_SIZE_CHARS,
|
||||
description="Default per-tool threshold (chars) for persisting large results "
|
||||
"to sandbox. Results exceeding this are written to /tmp/hermes-results/ "
|
||||
"and replaced with a preview. Per-tool registry values take precedence "
|
||||
"unless overridden via tool_result_overrides.",
|
||||
)
|
||||
turn_budget_chars: int = Field(
|
||||
default=DEFAULT_TURN_BUDGET_CHARS,
|
||||
description="Aggregate char budget per assistant turn. If all tool results "
|
||||
"in a single turn exceed this, the largest are persisted to disk first.",
|
||||
)
|
||||
preview_size_chars: int = Field(
|
||||
default=DEFAULT_PREVIEW_SIZE_CHARS,
|
||||
description="Size of the inline preview shown after a tool result is persisted.",
|
||||
)
|
||||
tool_result_overrides: Optional[Dict[str, int]] = Field(
|
||||
default=None,
|
||||
description="Per-tool threshold overrides (chars). Keys are tool names, "
|
||||
"values are char thresholds. Overrides both the default and registry "
|
||||
"per-tool values. Example: {'terminal': 10000, 'search_files': 5000}. "
|
||||
"Note: read_file is pinned to infinity and cannot be overridden.",
|
||||
)
|
||||
|
||||
# --- Provider-specific parameters ---
|
||||
# Passed as extra_body to the OpenAI client's chat.completions.create() call.
|
||||
# Useful for OpenRouter provider preferences, transforms, route settings, etc.
|
||||
@@ -176,6 +207,16 @@ class HermesAgentEnvConfig(BaseEnvConfig):
|
||||
"transforms, and other provider-specific settings.",
|
||||
)
|
||||
|
||||
def build_budget_config(self):
    """Assemble the tool-result ``BudgetConfig`` described by this env config.

    Maps ``default_result_size_chars``, ``turn_budget_chars`` and
    ``preview_size_chars`` onto the corresponding ``BudgetConfig`` fields and
    passes ``tool_result_overrides`` through as a fresh dict.

    Returns:
        A new ``BudgetConfig`` instance.
    """
    # Imported lazily, at call time, rather than at module import.
    from tools.budget_config import BudgetConfig

    # Copy so the returned config never shares a dict with this object;
    # an unset (None) or empty mapping becomes an empty dict.
    per_tool = dict(self.tool_result_overrides or {})

    return BudgetConfig(
        default_result_size=self.default_result_size_chars,
        turn_budget=self.turn_budget_chars,
        preview_size=self.preview_size_chars,
        tool_overrides=per_tool,
    )
|
||||
|
||||
|
||||
class HermesAgentBaseEnv(BaseEnv):
|
||||
"""
|
||||
@@ -490,6 +531,7 @@ class HermesAgentBaseEnv(BaseEnv):
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
except NotImplementedError:
|
||||
@@ -507,6 +549,7 @@ class HermesAgentBaseEnv(BaseEnv):
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
else:
|
||||
@@ -520,6 +563,7 @@ class HermesAgentBaseEnv(BaseEnv):
|
||||
temperature=self.config.agent_temperature,
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
|
||||
@@ -472,6 +472,7 @@ class WebResearchEnv(HermesAgentBaseEnv):
|
||||
temperature=0.0, # Deterministic for eval
|
||||
max_tokens=self.config.max_token_length,
|
||||
extra_body=self.config.extra_body,
|
||||
budget_config=self.config.build_budget_config(),
|
||||
)
|
||||
result = await agent.run(messages)
|
||||
|
||||
|
||||
52
tools/budget_config.py
Normal file
52
tools/budget_config.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""Configurable budget constants for tool result persistence.
|
||||
|
||||
Overridable at the RL environment level via HermesAgentEnvConfig fields.
|
||||
Per-tool resolution: pinned > config overrides > registry > default.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict
|
||||
|
||||
# Tools whose thresholds must never be overridden.
|
||||
# read_file=inf prevents infinite persist->read->persist loops.
|
||||
PINNED_THRESHOLDS: Dict[str, float] = {
|
||||
"read_file": float("inf"),
|
||||
}
|
||||
|
||||
# Defaults matching the current hardcoded values in tool_result_storage.py.
|
||||
# Kept here as the single source of truth; tool_result_storage.py imports these.
|
||||
DEFAULT_RESULT_SIZE_CHARS: int = 50_000
|
||||
DEFAULT_TURN_BUDGET_CHARS: int = 200_000
|
||||
DEFAULT_PREVIEW_SIZE_CHARS: int = 2_000
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class BudgetConfig:
|
||||
"""Immutable budget constants for the 3-layer tool result persistence system.
|
||||
|
||||
Layer 2 (per-result): resolve_threshold(tool_name) -> threshold in chars.
|
||||
Layer 3 (per-turn): turn_budget -> aggregate char budget across all tool
|
||||
results in a single assistant turn.
|
||||
Preview: preview_size -> inline snippet size after persistence.
|
||||
"""
|
||||
|
||||
default_result_size: int = DEFAULT_RESULT_SIZE_CHARS
|
||||
turn_budget: int = DEFAULT_TURN_BUDGET_CHARS
|
||||
preview_size: int = DEFAULT_PREVIEW_SIZE_CHARS
|
||||
tool_overrides: Dict[str, int] = field(default_factory=dict)
|
||||
|
||||
def resolve_threshold(self, tool_name: str) -> int | float:
|
||||
"""Resolve the persistence threshold for a tool.
|
||||
|
||||
Priority: pinned -> tool_overrides -> registry per-tool -> default.
|
||||
"""
|
||||
if tool_name in PINNED_THRESHOLDS:
|
||||
return PINNED_THRESHOLDS[tool_name]
|
||||
if tool_name in self.tool_overrides:
|
||||
return self.tool_overrides[tool_name]
|
||||
from tools.registry import registry
|
||||
return registry.get_max_result_size(tool_name, default=self.default_result_size)
|
||||
|
||||
|
||||
# Default config -- matches current hardcoded behavior exactly.
|
||||
DEFAULT_BUDGET = BudgetConfig()
|
||||
@@ -169,12 +169,14 @@ class ToolRegistry:
|
||||
# Query helpers (replace redundant dicts in model_tools.py)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_max_result_size(self, name: str) -> int | float:
|
||||
"""Return per-tool max result size, or global default."""
|
||||
from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS
|
||||
def get_max_result_size(self, name: str, default: int | float | None = None) -> int | float:
|
||||
"""Return per-tool max result size, or *default* (or global default)."""
|
||||
entry = self._tools.get(name)
|
||||
if entry and entry.max_result_size_chars is not None:
|
||||
return entry.max_result_size_chars
|
||||
if default is not None:
|
||||
return default
|
||||
from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS
|
||||
return DEFAULT_MAX_RESULT_SIZE_CHARS
|
||||
|
||||
def get_all_tool_names(self) -> List[str]:
|
||||
|
||||
@@ -24,11 +24,13 @@ import json
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
from tools.budget_config import (
|
||||
DEFAULT_RESULT_SIZE_CHARS as DEFAULT_MAX_RESULT_SIZE_CHARS,
|
||||
DEFAULT_TURN_BUDGET_CHARS as MAX_TURN_BUDGET_CHARS,
|
||||
DEFAULT_PREVIEW_SIZE_CHARS as PREVIEW_SIZE_CHARS,
|
||||
)
|
||||
|
||||
DEFAULT_MAX_RESULT_SIZE_CHARS: int = 50_000
|
||||
MAX_TURN_BUDGET_CHARS: int = 200_000
|
||||
PREVIEW_SIZE_CHARS: int = 2_000
|
||||
logger = logging.getLogger(__name__)
|
||||
PERSISTED_OUTPUT_TAG = "<persisted-output>"
|
||||
PERSISTED_OUTPUT_CLOSING_TAG = "</persisted-output>"
|
||||
STORAGE_DIR = "/tmp/hermes-results"
|
||||
@@ -112,6 +114,7 @@ def maybe_persist_tool_result(
|
||||
tool_use_id: str,
|
||||
env=None,
|
||||
threshold: int | float | None = None,
|
||||
preview_size: int = PREVIEW_SIZE_CHARS,
|
||||
) -> str:
|
||||
"""Layer 2: persist oversized result into the sandbox, return preview + path.
|
||||
|
||||
@@ -125,6 +128,7 @@ def maybe_persist_tool_result(
|
||||
tool_use_id: Unique ID for this tool call (used as filename).
|
||||
env: The active BaseEnvironment instance, or None.
|
||||
threshold: Override threshold; if None, looked up from registry.
|
||||
preview_size: Max chars for the inline preview after persistence.
|
||||
|
||||
Returns:
|
||||
Original content if small, or <persisted-output> replacement.
|
||||
@@ -143,7 +147,7 @@ def maybe_persist_tool_result(
|
||||
remote_path = f"{STORAGE_DIR}/{tool_use_id}.txt"
|
||||
# Write raw output (not JSON wrapper) so read_file returns readable text
|
||||
file_content = _extract_raw_output(content)
|
||||
preview, has_more = generate_preview(file_content)
|
||||
preview, has_more = generate_preview(file_content, max_chars=preview_size)
|
||||
|
||||
# Try writing into the sandbox
|
||||
if env is not None:
|
||||
@@ -173,6 +177,7 @@ def enforce_turn_budget(
|
||||
tool_messages: list[dict],
|
||||
env=None,
|
||||
budget: int = MAX_TURN_BUDGET_CHARS,
|
||||
preview_size: int = PREVIEW_SIZE_CHARS,
|
||||
) -> list[dict]:
|
||||
"""Layer 3: enforce aggregate budget across all tool results in a turn.
|
||||
|
||||
@@ -210,6 +215,7 @@ def enforce_turn_budget(
|
||||
tool_use_id=tool_use_id,
|
||||
env=env,
|
||||
threshold=0,
|
||||
preview_size=preview_size,
|
||||
)
|
||||
if replacement != content:
|
||||
total_size -= size
|
||||
|
||||
Reference in New Issue
Block a user