Files
hermes-agent/gateway/status.py
teknium1 5a2fcaab39 fix(gateway): harden Telegram polling conflict handling
- detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever
- add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once
- persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration
     Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled)
     Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago
 Invocation: 8879379b25994201b98381f4bd80c2af
   Main PID: 1147926 (python)
      Tasks: 16 (limit: 76757)
     Memory: 151.4M (peak: 168.1M)
        CPU: 47.883s
     CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service
             ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace
             └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat

Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)...
Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data]
Mar 14 09:27:04 teknium-dev python[1147926]:    Content: ''
Mar 14 09:27:04 teknium-dev python[1147926]:  Max retries (3) for empty content exceeded.
Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data]
Mar 14 09:27:07 teknium-dev python[1147926]:    Content: ''
Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)...
Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data]
Mar 14 09:27:12 teknium-dev python[1147926]:    Content: ''
Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)...
⚠ Installed gateway service definition is outdated
  Run: hermes gateway restart  # auto-refreshes the unit

✓ Gateway service is running
✓ Systemd linger is enabled (service survives logout)
- cleanly exit non-retryable startup conflicts without triggering service restart loops

Tests:
- gateway status runtime-state helpers
- Telegram token-lock and polling-conflict behavior
- GatewayRunner clean exit on non-retryable startup conflict
- CLI runtime health summary
2026-03-14 12:11:23 -07:00

337 lines
10 KiB
Python

"""
Gateway runtime status helpers.
Provides PID-file based detection of whether the gateway daemon is running,
used by send_message's check_fn to gate availability in the CLI.
The PID file lives at ``{HERMES_HOME}/gateway.pid``. HERMES_HOME defaults to
``~/.hermes`` but can be overridden via the environment variable. This means
separate HERMES_HOME directories naturally get separate PID files — a property
that will be useful when we add named profiles (multiple agents running
concurrently under distinct configurations).
"""
import hashlib
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
_GATEWAY_KIND = "hermes-gateway"
_RUNTIME_STATUS_FILE = "gateway_state.json"
_LOCKS_DIRNAME = "gateway-locks"
def _get_pid_path() -> Path:
"""Return the path to the gateway PID file, respecting HERMES_HOME."""
home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
return home / "gateway.pid"
def _get_runtime_status_path() -> Path:
"""Return the persisted runtime health/status file path."""
return _get_pid_path().with_name(_RUNTIME_STATUS_FILE)
def _get_lock_dir() -> Path:
"""Return the machine-local directory for token-scoped gateway locks."""
override = os.getenv("HERMES_GATEWAY_LOCK_DIR")
if override:
return Path(override)
state_home = Path(os.getenv("XDG_STATE_HOME", Path.home() / ".local" / "state"))
return state_home / "hermes" / _LOCKS_DIRNAME
def _utc_now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _scope_hash(identity: str) -> str:
return hashlib.sha256(identity.encode("utf-8")).hexdigest()[:16]
def _get_scope_lock_path(scope: str, identity: str) -> Path:
return _get_lock_dir() / f"{scope}-{_scope_hash(identity)}.lock"
def _get_process_start_time(pid: int) -> Optional[int]:
"""Return the kernel start time for a process when available."""
stat_path = Path(f"/proc/{pid}/stat")
try:
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
return int(stat_path.read_text().split()[21])
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
return None
def _read_process_cmdline(pid: int) -> Optional[str]:
"""Return the process command line as a space-separated string."""
cmdline_path = Path(f"/proc/{pid}/cmdline")
try:
raw = cmdline_path.read_bytes()
except (FileNotFoundError, PermissionError, OSError):
return None
if not raw:
return None
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
def _looks_like_gateway_process(pid: int) -> bool:
"""Return True when the live PID still looks like the Hermes gateway."""
cmdline = _read_process_cmdline(pid)
if not cmdline:
# If we cannot inspect the process, fall back to the liveness check.
return True
patterns = (
"hermes_cli.main gateway",
"hermes gateway",
"gateway/run.py",
)
return any(pattern in cmdline for pattern in patterns)
def _build_pid_record() -> dict:
return {
"pid": os.getpid(),
"kind": _GATEWAY_KIND,
"argv": list(sys.argv),
"start_time": _get_process_start_time(os.getpid()),
}
def _build_runtime_status_record() -> dict[str, Any]:
payload = _build_pid_record()
payload.update({
"gateway_state": "starting",
"exit_reason": None,
"platforms": {},
"updated_at": _utc_now_iso(),
})
return payload
def _read_json_file(path: Path) -> Optional[dict[str, Any]]:
if not path.exists():
return None
try:
raw = path.read_text().strip()
except OSError:
return None
if not raw:
return None
try:
payload = json.loads(raw)
except json.JSONDecodeError:
return None
return payload if isinstance(payload, dict) else None
def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload))
def _read_pid_record() -> Optional[dict]:
pid_path = _get_pid_path()
if not pid_path.exists():
return None
raw = pid_path.read_text().strip()
if not raw:
return None
try:
payload = json.loads(raw)
except json.JSONDecodeError:
try:
return {"pid": int(raw)}
except ValueError:
return None
if isinstance(payload, int):
return {"pid": payload}
if isinstance(payload, dict):
return payload
return None
def write_pid_file() -> None:
"""Write the current process PID and metadata to the gateway PID file."""
_write_json_file(_get_pid_path(), _build_pid_record())
def write_runtime_status(
*,
gateway_state: Optional[str] = None,
exit_reason: Optional[str] = None,
platform: Optional[str] = None,
platform_state: Optional[str] = None,
error_code: Optional[str] = None,
error_message: Optional[str] = None,
) -> None:
"""Persist gateway runtime health information for diagnostics/status."""
path = _get_runtime_status_path()
payload = _read_json_file(path) or _build_runtime_status_record()
payload.setdefault("platforms", {})
payload.setdefault("kind", _GATEWAY_KIND)
payload.setdefault("pid", os.getpid())
payload.setdefault("start_time", _get_process_start_time(os.getpid()))
payload["updated_at"] = _utc_now_iso()
if gateway_state is not None:
payload["gateway_state"] = gateway_state
if exit_reason is not None:
payload["exit_reason"] = exit_reason
if platform is not None:
platform_payload = payload["platforms"].get(platform, {})
if platform_state is not None:
platform_payload["state"] = platform_state
if error_code is not None:
platform_payload["error_code"] = error_code
if error_message is not None:
platform_payload["error_message"] = error_message
platform_payload["updated_at"] = _utc_now_iso()
payload["platforms"][platform] = platform_payload
_write_json_file(path, payload)
def read_runtime_status() -> Optional[dict[str, Any]]:
"""Read the persisted gateway runtime health/status information."""
return _read_json_file(_get_runtime_status_path())
def remove_pid_file() -> None:
"""Remove the gateway PID file if it exists."""
try:
_get_pid_path().unlink(missing_ok=True)
except Exception:
pass
def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, Any]] = None) -> tuple[bool, Optional[dict[str, Any]]]:
"""Acquire a machine-local lock keyed by scope + identity.
Used to prevent multiple local gateways from using the same external identity
at once (e.g. the same Telegram bot token across different HERMES_HOME dirs).
"""
lock_path = _get_scope_lock_path(scope, identity)
lock_path.parent.mkdir(parents=True, exist_ok=True)
record = {
**_build_pid_record(),
"scope": scope,
"identity_hash": _scope_hash(identity),
"metadata": metadata or {},
"updated_at": _utc_now_iso(),
}
existing = _read_json_file(lock_path)
if existing:
try:
existing_pid = int(existing["pid"])
except (KeyError, TypeError, ValueError):
existing_pid = None
if existing_pid == os.getpid() and existing.get("start_time") == record.get("start_time"):
_write_json_file(lock_path, record)
return True, existing
stale = existing_pid is None
if not stale:
try:
os.kill(existing_pid, 0)
except (ProcessLookupError, PermissionError):
stale = True
else:
current_start = _get_process_start_time(existing_pid)
if (
existing.get("start_time") is not None
and current_start is not None
and current_start != existing.get("start_time")
):
stale = True
if stale:
try:
lock_path.unlink(missing_ok=True)
except OSError:
pass
else:
return False, existing
try:
fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
except FileExistsError:
return False, _read_json_file(lock_path)
try:
with os.fdopen(fd, "w", encoding="utf-8") as handle:
json.dump(record, handle)
except Exception:
try:
lock_path.unlink(missing_ok=True)
except OSError:
pass
raise
return True, None
def release_scoped_lock(scope: str, identity: str) -> None:
"""Release a previously-acquired scope lock when owned by this process."""
lock_path = _get_scope_lock_path(scope, identity)
existing = _read_json_file(lock_path)
if not existing:
return
if existing.get("pid") != os.getpid():
return
if existing.get("start_time") != _get_process_start_time(os.getpid()):
return
try:
lock_path.unlink(missing_ok=True)
except OSError:
pass
def get_running_pid() -> Optional[int]:
"""Return the PID of a running gateway instance, or ``None``.
Checks the PID file and verifies the process is actually alive.
Cleans up stale PID files automatically.
"""
record = _read_pid_record()
if not record:
remove_pid_file()
return None
try:
pid = int(record["pid"])
except (KeyError, TypeError, ValueError):
remove_pid_file()
return None
try:
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
except (ProcessLookupError, PermissionError):
remove_pid_file()
return None
recorded_start = record.get("start_time")
current_start = _get_process_start_time(pid)
if recorded_start is not None and current_start is not None and current_start != recorded_start:
remove_pid_file()
return None
if not _looks_like_gateway_process(pid):
remove_pid_file()
return None
return pid
def is_gateway_running() -> bool:
"""Check if the gateway daemon is currently running."""
return get_running_pid() is not None