feat: shared iteration budget across parent + subagents

Subagent tool calls now count toward the same session-wide iteration limit as the parent agent. Previously, each subagent had its own independent counter, so a parent with max_iterations=60 could spawn 3 subagents each doing 50 calls = 150 total tool calls unmetered. Changes: - IterationBudget: thread-safe shared counter (run_agent.py) - consume(): try to use one iteration, returns False if exhausted - refund(): give back one iteration (for execute_code turns) - Thread-safe via Lock (subagents run in ThreadPoolExecutor) - Parent creates the budget, children inherit it via delegate_tool.py - execute_code turns are refunded (don't count against budget) - Default raised from 60 → 90 to account for shared consumption - Per-child cap (50) still applies as a safety valve The per-child max_iterations (default 50) remains as a per-child ceiling, but the shared budget is the hard session-wide limit. A child stops at whichever comes first.
2026-03-07 08:16:37 -08:00
parent 5da55ea1e3
commit 0a82396718
4 changed files with 67 additions and 7 deletions
--- a/cli.py
+++ b/cli.py
@@ -169,7 +169,7 @@ def load_cli_config() -> Dict[str, Any]:
            "summary_model": "google/gemini-3-flash-preview",  # Fast/cheap model for summaries
        },
        "agent": {
-            "max_turns": 60,  # Default max tool-calling iterations
+            "max_turns": 90,  # Default max tool-calling iterations (shared with subagents)
            "verbose": False,
            "system_prompt": "",
            "prefill_messages_file": "",
@@ -836,7 +836,7 @@ class HermesCLI:
            provider: Inference provider ("auto", "openrouter", "nous", "openai-codex", "zai", "kimi-coding", "minimax", "minimax-cn")
            api_key: API key (default: from environment)
            base_url: API base URL (default: OpenRouter)
-            max_turns: Maximum tool-calling iterations (default: 60)
+            max_turns: Maximum tool-calling iterations shared with subagents (default: 90)
            verbose: Enable verbose logging
            compact: Use compact display mode
            resume: Session ID to resume (restores conversation history from SQLite)
@@ -889,7 +889,7 @@ class HermesCLI:
        elif os.getenv("HERMES_MAX_ITERATIONS"):
            self.max_turns = int(os.getenv("HERMES_MAX_ITERATIONS"))
        else:
-            self.max_turns = 60
+            self.max_turns = 90
        
        # Parse and validate toolsets
        self.enabled_toolsets = toolsets
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -2097,7 +2097,7 @@ class GatewayRunner:
            os.environ["HERMES_SESSION_KEY"] = session_key or ""

            # Read from env var or use default (same as CLI)
-            max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "60"))
+            max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "90"))
            
            # Map platform enum to the platform hint key the agent understands.
            # Platform.LOCAL ("local") maps to "cli"; others pass through as-is.
--- a/run_agent.py
+++ b/run_agent.py
@@ -99,6 +99,46 @@ from agent.trajectory import (
 )


+class IterationBudget:
+    """Thread-safe shared iteration counter for parent and child agents.
+
+    Tracks total LLM-call iterations consumed across a parent agent and all
+    its subagents.  A single ``IterationBudget`` is created by the parent
+    and passed to every child so they share the same cap.
+
+    ``execute_code`` (programmatic tool calling) iterations are refunded via
+    :meth:`refund` so they don't eat into the budget.
+    """
+
+    def __init__(self, max_total: int):
+        self.max_total = max_total
+        self._used = 0
+        self._lock = threading.Lock()
+
+    def consume(self) -> bool:
+        """Try to consume one iteration.  Returns True if allowed."""
+        with self._lock:
+            if self._used >= self.max_total:
+                return False
+            self._used += 1
+            return True
+
+    def refund(self) -> None:
+        """Give back one iteration (e.g. for execute_code turns)."""
+        with self._lock:
+            if self._used > 0:
+                self._used -= 1
+
+    @property
+    def used(self) -> int:
+        return self._used
+
+    @property
+    def remaining(self) -> int:
+        with self._lock:
+            return max(0, self.max_total - self._used)
+
+
 class AIAgent:
    """
    AI Agent with tool calling capabilities.
@@ -114,7 +154,7 @@ class AIAgent:
        provider: str = None,
        api_mode: str = None,
        model: str = "anthropic/claude-opus-4.6",  # OpenRouter format
-        max_iterations: int = 60,  # Default tool-calling iterations
+        max_iterations: int = 90,  # Default tool-calling iterations (shared with subagents)
        tool_delay: float = 1.0,
        enabled_toolsets: List[str] = None,
        disabled_toolsets: List[str] = None,
@@ -142,6 +182,7 @@ class AIAgent:
        skip_memory: bool = False,
        session_db=None,
        honcho_session_key: str = None,
+        iteration_budget: "IterationBudget" = None,
    ):
        """
        Initialize the AI Agent.
@@ -152,7 +193,7 @@ class AIAgent:
            provider (str): Provider identifier (optional; used for telemetry/routing hints)
            api_mode (str): API mode override: "chat_completions" or "codex_responses"
            model (str): Model name to use (default: "anthropic/claude-opus-4.6")
-            max_iterations (int): Maximum number of tool calling iterations (default: 60)
+            max_iterations (int): Maximum number of tool calling iterations (default: 90)
            tool_delay (float): Delay between tool calls in seconds (default: 1.0)
            enabled_toolsets (List[str]): Only enable tools from these toolsets (optional)
            disabled_toolsets (List[str]): Disable tools from these toolsets (optional)
@@ -186,6 +227,9 @@ class AIAgent:
        """
        self.model = model
        self.max_iterations = max_iterations
+        # Shared iteration budget — parent creates, children inherit.
+        # Consumed by every LLM turn across parent + all subagents.
+        self.iteration_budget = iteration_budget or IterationBudget(max_iterations)
        self.tool_delay = tool_delay
        self.save_trajectories = save_trajectories
        self.verbose_logging = verbose_logging
@@ -2974,7 +3018,7 @@ class AIAgent:
        # Clear any stale interrupt state at start
        self.clear_interrupt()
        
-        while api_call_count < self.max_iterations:
+        while api_call_count < self.max_iterations and self.iteration_budget.remaining > 0:
            # Check for interrupt request (e.g., user sent new message)
            if self._interrupt_requested:
                interrupted = True
@@ -2983,6 +3027,10 @@ class AIAgent:
                break
            
            api_call_count += 1
+            if not self.iteration_budget.consume():
+                if not self.quiet_mode:
+                    print(f"\n⚠️  Session iteration budget exhausted ({self.iteration_budget.max_total} total across agent + subagents)")
+                break

            # Fire step_callback for gateway hooks (agent:step event)
            if self.step_callback is not None:
@@ -3749,6 +3797,13 @@ class AIAgent:
                    self._log_msg_to_db(assistant_msg)
                    
                    self._execute_tool_calls(assistant_message, messages, effective_task_id)
+
+                    # Refund the iteration if the ONLY tool(s) called were
+                    # execute_code (programmatic tool calling).  These are
+                    # cheap RPC-style calls that shouldn't eat the budget.
+                    _tc_names = {tc.function.name for tc in assistant_message.tool_calls}
+                    if _tc_names == {"execute_code"}:
+                        self.iteration_budget.refund()
                    
                    if self.compression_enabled and self.context_compressor.should_compress():
                        messages, active_system_prompt = self._compress_context(
--- a/tools/delegate_tool.py
+++ b/tools/delegate_tool.py
@@ -194,6 +194,10 @@ def _run_single_child(
        # Build progress callback to relay tool calls to parent display
        child_progress_cb = _build_child_progress_callback(task_index, parent_agent, task_count)

+        # Share the parent's iteration budget so subagent tool calls
+        # count toward the session-wide limit.
+        shared_budget = getattr(parent_agent, "iteration_budget", None)
+
        child = AIAgent(
            base_url=parent_agent.base_url,
            api_key=parent_api_key,
@@ -215,6 +219,7 @@ def _run_single_child(
            providers_order=parent_agent.providers_order,
            provider_sort=parent_agent.provider_sort,
            tool_progress_callback=child_progress_cb,
+            iteration_budget=shared_budget,
        )

        # Set delegation depth so children can't spawn grandchildren