fix(gateway): replace wall-clock agent timeout with inactivity-based timeout (#5389)
The gateway previously used a hard wall-clock asyncio.wait_for timeout that killed agents after a fixed duration regardless of activity. This punished legitimate long-running tasks (subagent delegation, reasoning models, multi-step research). The gateway now uses an inactivity-based polling loop that checks the agent's built-in activity tracker (get_activity_summary) every 5 seconds. The agent can run indefinitely as long as it is actively calling tools or receiving API responses; the timeout only fires when the agent has been completely idle for the configured duration.

Changes:
- Replace asyncio.wait_for with an asyncio.wait poll loop that checks agent idle time via get_activity_summary()
- Add agent.gateway_timeout config.yaml key (default 1800s, 0 = unlimited)
- Update stale session eviction to use agent idle time instead of pure wall-clock age (prevents evicting active long-running tasks)
- Preserve all existing diagnostic logging and user-facing context

Inspired by PR #4864 (Mibayy) and issue #4815 (BongSuCHOI). Reimplemented on current main using the existing _touch_activity() infrastructure rather than a parallel tracker.
This commit is contained in:
122
gateway/run.py
122
gateway/run.py
@@ -182,6 +182,10 @@ if _config_path.exists():
|
||||
if _agent_cfg and isinstance(_agent_cfg, dict):
|
||||
if "max_turns" in _agent_cfg:
|
||||
os.environ["HERMES_MAX_ITERATIONS"] = str(_agent_cfg["max_turns"])
|
||||
# Bridge agent.gateway_timeout → HERMES_AGENT_TIMEOUT env var.
|
||||
# Env var from .env takes precedence (already in os.environ).
|
||||
if "gateway_timeout" in _agent_cfg and "HERMES_AGENT_TIMEOUT" not in os.environ:
|
||||
os.environ["HERMES_AGENT_TIMEOUT"] = str(_agent_cfg["gateway_timeout"])
|
||||
# Timezone: bridge config.yaml → HERMES_TIMEZONE env var.
|
||||
# HERMES_TIMEZONE from .env takes precedence (already in os.environ).
|
||||
_tz_cfg = _cfg.get("timezone", "")
|
||||
@@ -1800,32 +1804,46 @@ class GatewayRunner:
|
||||
# simultaneous updates. Do NOT interrupt for photo-only follow-ups here;
|
||||
# let the adapter-level batching/queueing logic absorb them.
|
||||
|
||||
# Staleness eviction: if an entry has been in _running_agents for
|
||||
# longer than the agent timeout, it's a leaked lock from a hung or
|
||||
# crashed handler. Evict it so the session isn't permanently stuck.
|
||||
# Staleness eviction: detect leaked locks from hung/crashed handlers.
|
||||
# With inactivity-based timeout, active tasks can run for hours, so
|
||||
# wall-clock age alone isn't sufficient. Evict only when the agent
|
||||
# has been *idle* beyond the inactivity threshold (or when the agent
|
||||
# object has no activity tracker and wall-clock age is extreme).
|
||||
_raw_stale_timeout = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800))
|
||||
_STALE_TTL = (_raw_stale_timeout + 60) if _raw_stale_timeout > 0 else float("inf")
|
||||
_stale_ts = self._running_agents_ts.get(_quick_key, 0)
|
||||
if _quick_key in self._running_agents and _stale_ts and (time.time() - _stale_ts) > _STALE_TTL:
|
||||
if _quick_key in self._running_agents and _stale_ts:
|
||||
_stale_age = time.time() - _stale_ts
|
||||
_stale_agent = self._running_agents.get(_quick_key)
|
||||
_stale_idle = float("inf") # assume idle if we can't check
|
||||
_stale_detail = ""
|
||||
if _stale_agent and hasattr(_stale_agent, "get_activity_summary"):
|
||||
try:
|
||||
_sa = _stale_agent.get_activity_summary()
|
||||
_stale_idle = _sa.get("seconds_since_activity", float("inf"))
|
||||
_stale_detail = (
|
||||
f" | last_activity={_sa.get('last_activity_desc', 'unknown')} "
|
||||
f"({_sa.get('seconds_since_activity', 0):.0f}s ago) "
|
||||
f"({_stale_idle:.0f}s ago) "
|
||||
f"| iteration={_sa.get('api_call_count', 0)}/{_sa.get('max_iterations', 0)}"
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
logger.warning(
|
||||
"Evicting stale _running_agents entry for %s (age: %.0fs, TTL: %.0fs)%s",
|
||||
_quick_key[:30], _stale_age, _STALE_TTL, _stale_detail,
|
||||
# Evict if: agent is idle beyond timeout, OR wall-clock age is
|
||||
# extreme (10x timeout or 2h, whichever is larger — catches
|
||||
# cases where the agent object was garbage-collected).
|
||||
_wall_ttl = max(_raw_stale_timeout * 10, 7200) if _raw_stale_timeout > 0 else float("inf")
|
||||
_should_evict = (
|
||||
(_raw_stale_timeout > 0 and _stale_idle >= _raw_stale_timeout)
|
||||
or _stale_age > _wall_ttl
|
||||
)
|
||||
del self._running_agents[_quick_key]
|
||||
self._running_agents_ts.pop(_quick_key, None)
|
||||
if _should_evict:
|
||||
logger.warning(
|
||||
"Evicting stale _running_agents entry for %s "
|
||||
"(age: %.0fs, idle: %.0fs, timeout: %.0fs)%s",
|
||||
_quick_key[:30], _stale_age, _stale_idle,
|
||||
_raw_stale_timeout, _stale_detail,
|
||||
)
|
||||
del self._running_agents[_quick_key]
|
||||
self._running_agents_ts.pop(_quick_key, None)
|
||||
|
||||
if _quick_key in self._running_agents:
|
||||
if event.get_command() == "status":
|
||||
@@ -6766,19 +6784,54 @@ class GatewayRunner:
|
||||
_notify_task = asyncio.create_task(_notify_long_running())
|
||||
|
||||
try:
|
||||
# Run in thread pool to not block. Cap total execution time
|
||||
# so a hung API call or runaway tool doesn't permanently lock
|
||||
# the session. Default 30 minutes; override with env var.
|
||||
# Set to 0 for no limit (infinite).
|
||||
# Run in thread pool to not block. Use an *inactivity*-based
|
||||
# timeout instead of a wall-clock limit: the agent can run for
|
||||
# hours if it's actively calling tools / receiving stream tokens,
|
||||
# but a hung API call or stuck tool with no activity for the
|
||||
# configured duration is caught and killed. (#4815)
|
||||
#
|
||||
# Config: agent.gateway_timeout in config.yaml, or
|
||||
# HERMES_AGENT_TIMEOUT env var (env var takes precedence).
|
||||
# Default 1800s (30 min inactivity). 0 = unlimited.
|
||||
_agent_timeout_raw = float(os.getenv("HERMES_AGENT_TIMEOUT", 1800))
|
||||
_agent_timeout = _agent_timeout_raw if _agent_timeout_raw > 0 else None
|
||||
loop = asyncio.get_event_loop()
|
||||
try:
|
||||
response = await asyncio.wait_for(
|
||||
loop.run_in_executor(None, run_sync),
|
||||
timeout=_agent_timeout,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
_executor_task = asyncio.ensure_future(
|
||||
loop.run_in_executor(None, run_sync)
|
||||
)
|
||||
|
||||
_inactivity_timeout = False
|
||||
_POLL_INTERVAL = 5.0
|
||||
|
||||
if _agent_timeout is None:
|
||||
# Unlimited — just await the result.
|
||||
response = await _executor_task
|
||||
else:
|
||||
# Poll loop: check the agent's built-in activity tracker
|
||||
# (updated by _touch_activity() on every tool call, API
|
||||
# call, and stream delta) every few seconds.
|
||||
response = None
|
||||
while True:
|
||||
done, _ = await asyncio.wait(
|
||||
{_executor_task}, timeout=_POLL_INTERVAL
|
||||
)
|
||||
if done:
|
||||
response = _executor_task.result()
|
||||
break
|
||||
# Agent still running — check inactivity.
|
||||
_agent_ref = agent_holder[0]
|
||||
_idle_secs = 0.0
|
||||
if _agent_ref and hasattr(_agent_ref, "get_activity_summary"):
|
||||
try:
|
||||
_act = _agent_ref.get_activity_summary()
|
||||
_idle_secs = _act.get("seconds_since_activity", 0.0)
|
||||
except Exception:
|
||||
pass
|
||||
if _idle_secs >= _agent_timeout:
|
||||
_inactivity_timeout = True
|
||||
break
|
||||
|
||||
if _inactivity_timeout:
|
||||
# Build a diagnostic summary from the agent's activity tracker.
|
||||
_timed_out_agent = agent_holder[0]
|
||||
_activity = {}
|
||||
@@ -6795,29 +6848,26 @@ class GatewayRunner:
|
||||
_iter_max = _activity.get("max_iterations", 0)
|
||||
|
||||
logger.error(
|
||||
"Agent execution timed out after %.0fs for session %s "
|
||||
"| last_activity=%.0fs ago (%s) | iteration=%s/%s | tool=%s",
|
||||
_agent_timeout, session_key,
|
||||
_secs_ago, _last_desc, _iter_n, _iter_max,
|
||||
"Agent idle for %.0fs (timeout %.0fs) in session %s "
|
||||
"| last_activity=%s | iteration=%s/%s | tool=%s",
|
||||
_secs_ago, _agent_timeout, session_key,
|
||||
_last_desc, _iter_n, _iter_max,
|
||||
_cur_tool or "none",
|
||||
)
|
||||
|
||||
# Interrupt the agent if it's still running so the thread
|
||||
# pool worker is freed.
|
||||
if _timed_out_agent and hasattr(_timed_out_agent, "interrupt"):
|
||||
_timed_out_agent.interrupt("Execution timed out")
|
||||
_timed_out_agent.interrupt("Execution timed out (inactivity)")
|
||||
|
||||
_timeout_mins = int(_agent_timeout // 60)
|
||||
_timeout_mins = int(_agent_timeout // 60) or 1
|
||||
|
||||
# Construct a user-facing message with diagnostic context.
|
||||
_diag_lines = [f"⏱️ Request timed out after {_timeout_mins} minutes."]
|
||||
if _secs_ago < 30:
|
||||
_diag_lines.append(
|
||||
f"The agent was actively working when the timeout fired "
|
||||
f"(last activity: {_last_desc}, {_secs_ago:.0f}s ago, "
|
||||
f"iteration {_iter_n}/{_iter_max})."
|
||||
)
|
||||
elif _cur_tool:
|
||||
_diag_lines = [
|
||||
f"⏱️ Agent inactive for {_timeout_mins} min — no tool calls "
|
||||
f"or API responses."
|
||||
]
|
||||
if _cur_tool:
|
||||
_diag_lines.append(
|
||||
f"The agent appears stuck on tool `{_cur_tool}` "
|
||||
f"({_secs_ago:.0f}s since last activity, "
|
||||
@@ -6830,7 +6880,7 @@ class GatewayRunner:
|
||||
"The agent may have been waiting on an API response."
|
||||
)
|
||||
_diag_lines.append(
|
||||
"To increase the limit, set HERMES_AGENT_TIMEOUT in your .env "
|
||||
"To increase the limit, set agent.gateway_timeout in config.yaml "
|
||||
"(value in seconds, 0 = no limit) and restart the gateway.\n"
|
||||
"Try again, or use /reset to start fresh."
|
||||
)
|
||||
|
||||
@@ -205,6 +205,11 @@ DEFAULT_CONFIG = {
|
||||
"toolsets": ["hermes-cli"],
|
||||
"agent": {
|
||||
"max_turns": 90,
|
||||
# Inactivity timeout for gateway agent execution (seconds).
|
||||
# The agent can run indefinitely as long as it's actively calling
|
||||
# tools or receiving API responses. Only fires when the agent has
|
||||
# been completely idle for this duration. 0 = unlimited.
|
||||
"gateway_timeout": 1800,
|
||||
# Tool-use enforcement: injects system prompt guidance that tells the
|
||||
# model to actually call tools instead of describing intended actions.
|
||||
# Values: "auto" (default — applies to gpt/codex models), true/false
|
||||
|
||||
Reference in New Issue
Block a user