fix(gateway): replace wall-clock agent timeout with inactivity-based timeout (#5389)

The gateway previously used a hard wall-clock asyncio.wait_for timeout
that killed agents after a fixed duration regardless of activity. This
punished legitimate long-running tasks (subagent delegation, reasoning
models, multi-step research).

The gateway now uses an inactivity-based polling loop that checks the
agent's built-in activity tracker (get_activity_summary) every 5
seconds. The agent can run indefinitely as long as it is actively
calling tools or receiving API responses. The timeout only fires when
the agent has been completely idle for the configured duration.

Changes:
- Replace asyncio.wait_for with asyncio.wait poll loop checking
  agent idle time via get_activity_summary()
- Add agent.gateway_timeout config.yaml key (default 1800s, 0=unlimited)
- Update stale session eviction to use agent idle time instead of
  pure wall-clock (prevents evicting active long-running tasks)
- Preserve all existing diagnostic logging and user-facing context

Inspired by PR #4864 (Mibayy) and issue #4815 (BongSuCHOI).
Reimplemented on current main using existing _touch_activity()
infrastructure rather than a parallel tracker.
This commit is contained in:
Teknium
2026-04-05 19:38:21 -07:00
committed by GitHub
parent 8972eb05fd
commit fec58ad99e
2 changed files with 91 additions and 36 deletions

View File

@@ -182,6 +182,10 @@ if _config_path.exists():
if _agent_cfg and isinstance(_agent_cfg, dict):
if "max_turns" in _agent_cfg:
os.environ["HERMES_MAX_ITERATIONS"] = str(_agent_cfg["max_turns"])
# Bridge agent.gateway_timeout → HERMES_AGENT_TIMEOUT env var.
# Env var from .env takes precedence (already in os.environ).
if "gateway_timeout" in _agent_cfg and "HERMES_AGENT_TIMEOUT" not in os.environ:
os.environ["HERMES_AGENT_TIMEOUT"] = str(_agent_cfg["gateway_timeout"])
# Timezone: bridge config.yaml → HERMES_TIMEZONE env var.
# HERMES_TIMEZONE from .env takes precedence (already in os.environ).
_tz_cfg = _cfg.get("timezone", "")
@@ -1800,32 +1804,46 @@ class GatewayRunner:
# simultaneous updates. Do NOT interrupt for photo-only follow-ups here;
# let the adapter-level batching/queueing logic absorb them.
# Staleness eviction: if an entry has been in _running_agents for
# longer than the agent timeout, it's a leaked lock from a hung or
# crashed handler. Evict it so the session isn't permanently stuck.
# Staleness eviction: detect leaked locks from hung/crashed handlers.
# With inactivity-based timeout, active tasks can run for hours, so
# wall-clock age alone isn't sufficient. Evict only when the agent
# has been *idle* beyond the inactivity threshold (or when the agent
# object has no activity tracker and wall-clock age is extreme).
_raw_stale_timeout = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800))
_STALE_TTL = (_raw_stale_timeout + 60) if _raw_stale_timeout > 0 else float("inf")
_stale_ts = self._running_agents_ts.get(_quick_key, 0)
if _quick_key in self._running_agents and _stale_ts and (time.time() - _stale_ts) > _STALE_TTL:
if _quick_key in self._running_agents and _stale_ts:
_stale_age = time.time() - _stale_ts
_stale_agent = self._running_agents.get(_quick_key)
_stale_idle = float("inf") # assume idle if we can't check
_stale_detail = ""
if _stale_agent and hasattr(_stale_agent, "get_activity_summary"):
try:
_sa = _stale_agent.get_activity_summary()
_stale_idle = _sa.get("seconds_since_activity", float("inf"))
_stale_detail = (
f" | last_activity={_sa.get('last_activity_desc', 'unknown')} "
f"({_sa.get('seconds_since_activity', 0):.0f}s ago) "
f"({_stale_idle:.0f}s ago) "
f"| iteration={_sa.get('api_call_count', 0)}/{_sa.get('max_iterations', 0)}"
)
except Exception:
pass
logger.warning(
"Evicting stale _running_agents entry for %s (age: %.0fs, TTL: %.0fs)%s",
_quick_key[:30], _stale_age, _STALE_TTL, _stale_detail,
# Evict if: agent is idle beyond timeout, OR wall-clock age is
# extreme (10x timeout or 2h, whichever is larger — catches
# cases where the agent object was garbage-collected).
_wall_ttl = max(_raw_stale_timeout * 10, 7200) if _raw_stale_timeout > 0 else float("inf")
_should_evict = (
(_raw_stale_timeout > 0 and _stale_idle >= _raw_stale_timeout)
or _stale_age > _wall_ttl
)
del self._running_agents[_quick_key]
self._running_agents_ts.pop(_quick_key, None)
if _should_evict:
logger.warning(
"Evicting stale _running_agents entry for %s "
"(age: %.0fs, idle: %.0fs, timeout: %.0fs)%s",
_quick_key[:30], _stale_age, _stale_idle,
_raw_stale_timeout, _stale_detail,
)
del self._running_agents[_quick_key]
self._running_agents_ts.pop(_quick_key, None)
if _quick_key in self._running_agents:
if event.get_command() == "status":
@@ -6766,19 +6784,54 @@ class GatewayRunner:
_notify_task = asyncio.create_task(_notify_long_running())
try:
# Run in thread pool to not block. Cap total execution time
# so a hung API call or runaway tool doesn't permanently lock
# the session. Default 30 minutes; override with env var.
# Set to 0 for no limit (infinite).
# Run in thread pool to not block. Use an *inactivity*-based
# timeout instead of a wall-clock limit: the agent can run for
# hours if it's actively calling tools / receiving stream tokens,
# but a hung API call or stuck tool with no activity for the
# configured duration is caught and killed. (#4815)
#
# Config: agent.gateway_timeout in config.yaml, or
# HERMES_AGENT_TIMEOUT env var (env var takes precedence).
# Default 1800s (30 min inactivity). 0 = unlimited.
_agent_timeout_raw = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800))
_agent_timeout = _agent_timeout_raw if _agent_timeout_raw > 0 else None
loop = asyncio.get_event_loop()
try:
response = await asyncio.wait_for(
loop.run_in_executor(None, run_sync),
timeout=_agent_timeout,
)
except asyncio.TimeoutError:
_executor_task = asyncio.ensure_future(
loop.run_in_executor(None, run_sync)
)
_inactivity_timeout = False
_POLL_INTERVAL = 5.0
if _agent_timeout is None:
# Unlimited — just await the result.
response = await _executor_task
else:
# Poll loop: check the agent's built-in activity tracker
# (updated by _touch_activity() on every tool call, API
# call, and stream delta) every few seconds.
response = None
while True:
done, _ = await asyncio.wait(
{_executor_task}, timeout=_POLL_INTERVAL
)
if done:
response = _executor_task.result()
break
# Agent still running — check inactivity.
_agent_ref = agent_holder[0]
_idle_secs = 0.0
if _agent_ref and hasattr(_agent_ref, "get_activity_summary"):
try:
_act = _agent_ref.get_activity_summary()
_idle_secs = _act.get("seconds_since_activity", 0.0)
except Exception:
pass
if _idle_secs >= _agent_timeout:
_inactivity_timeout = True
break
if _inactivity_timeout:
# Build a diagnostic summary from the agent's activity tracker.
_timed_out_agent = agent_holder[0]
_activity = {}
@@ -6795,29 +6848,26 @@ class GatewayRunner:
_iter_max = _activity.get("max_iterations", 0)
logger.error(
"Agent execution timed out after %.0fs for session %s "
"| last_activity=%.0fs ago (%s) | iteration=%s/%s | tool=%s",
_agent_timeout, session_key,
_secs_ago, _last_desc, _iter_n, _iter_max,
"Agent idle for %.0fs (timeout %.0fs) in session %s "
"| last_activity=%s | iteration=%s/%s | tool=%s",
_secs_ago, _agent_timeout, session_key,
_last_desc, _iter_n, _iter_max,
_cur_tool or "none",
)
# Interrupt the agent if it's still running so the thread
# pool worker is freed.
if _timed_out_agent and hasattr(_timed_out_agent, "interrupt"):
_timed_out_agent.interrupt("Execution timed out")
_timed_out_agent.interrupt("Execution timed out (inactivity)")
_timeout_mins = int(_agent_timeout // 60)
_timeout_mins = int(_agent_timeout // 60) or 1
# Construct a user-facing message with diagnostic context.
_diag_lines = [f"⏱️ Request timed out after {_timeout_mins} minutes."]
if _secs_ago < 30:
_diag_lines.append(
f"The agent was actively working when the timeout fired "
f"(last activity: {_last_desc}, {_secs_ago:.0f}s ago, "
f"iteration {_iter_n}/{_iter_max})."
)
elif _cur_tool:
_diag_lines = [
f"⏱️ Agent inactive for {_timeout_mins} min — no tool calls "
f"or API responses."
]
if _cur_tool:
_diag_lines.append(
f"The agent appears stuck on tool `{_cur_tool}` "
f"({_secs_ago:.0f}s since last activity, "
@@ -6830,7 +6880,7 @@ class GatewayRunner:
"The agent may have been waiting on an API response."
)
_diag_lines.append(
"To increase the limit, set HERMES_AGENT_TIMEOUT in your .env "
"To increase the limit, set agent.gateway_timeout in config.yaml "
"(value in seconds, 0 = no limit) and restart the gateway.\n"
"Try again, or use /reset to start fresh."
)

View File

@@ -205,6 +205,11 @@ DEFAULT_CONFIG = {
"toolsets": ["hermes-cli"],
"agent": {
"max_turns": 90,
# Inactivity timeout for gateway agent execution (seconds).
# The agent can run indefinitely as long as it's actively calling
# tools or receiving API responses. Only fires when the agent has
# been completely idle for this duration. 0 = unlimited.
"gateway_timeout": 1800,
# Tool-use enforcement: injects system prompt guidance that tells the
# model to actually call tools instead of describing intended actions.
# Values: "auto" (default — applies to gpt/codex models), true/false