Compare commits

...

1 Commit

Author SHA1 Message Date
Metatron
cc1c061a6b perf: cap marathon sessions at 200 messages with forced compression (closes #326)
Some checks failed
Notebook CI / notebook-smoke (push) Failing after 1m46s
Forge CI / smoke-and-build (pull_request) Failing after 51s
Add max_messages limit (default 200, configurable via HERMES_MAX_MESSAGES
env var). When conversation message count exceeds the cap, force a
compression pass that creates a new session lineage.

Prevents error cascading and tool fixation in marathon sessions (200+
messages), which empirically show 45–84% error rates.

Uses existing _compress_context mechanism — agent continues seamlessly
in a fresh session with compressed history.
2026-04-13 21:31:03 -04:00

View File

@@ -586,6 +586,9 @@ class AIAgent:
self.model = model
self.max_iterations = max_iterations
# Marathon session cap: force compression when messages exceed this.
# Prevents error cascading in 200+ message sessions (#326).
self.max_messages = int(os.environ.get("HERMES_MAX_MESSAGES", "200"))
# Shared iteration budget — parent creates, children inherit.
# Consumed by every LLM turn across parent + all subagents.
self.iteration_budget = iteration_budget or IterationBudget(max_iterations)
@@ -7328,6 +7331,24 @@ class AIAgent:
pass
while api_call_count < self.max_iterations and self.iteration_budget.remaining > 0:
# Marathon session guard: if message count exceeds max_messages,
# force a compression pass to start a fresh session lineage.
# Prevents error cascading and tool fixation in long sessions. #326
if (
self.compression_enabled
and len(messages) > self.max_messages
):
if not self.quiet_mode:
self._safe_print(
f"\n🔄 Marathon session limit hit ({len(messages)} msgs > "
f"{self.max_messages} cap). Compressing and continuing..."
)
messages, active_system_prompt = self._compress_context(
messages, system_message,
approx_tokens=estimate_messages_tokens_rough(messages),
task_id=effective_task_id,
)
# Reset per-turn checkpoint dedup so each iteration can take one snapshot
self._checkpoint_mgr.new_turn()