Files
hermes-agent/gateway/status.py
teknium1 eb8316ea69 fix: harden gateway restart recovery
- store gateway PID metadata and validate the live process before trusting gateway.pid
- auto-refresh outdated systemd user units before start/restart so installs pick up --replace fixes
- sweep stray manual gateway processes after service stops
- add regression tests for PID validation and service drift recovery
2026-03-14 07:42:31 -07:00

154 lines
4.3 KiB
Python

"""
Gateway runtime status helpers.
Provides PID-file based detection of whether the gateway daemon is running,
used by send_message's check_fn to gate availability in the CLI.
The PID file lives at ``{HERMES_HOME}/gateway.pid``. HERMES_HOME defaults to
``~/.hermes`` but can be overridden via the environment variable. This means
separate HERMES_HOME directories naturally get separate PID files — a property
that will be useful when we add named profiles (multiple agents running
concurrently under distinct configurations).
"""
import json
import os
import sys
from pathlib import Path
from typing import Optional
_GATEWAY_KIND = "hermes-gateway"
def _get_pid_path() -> Path:
"""Return the path to the gateway PID file, respecting HERMES_HOME."""
home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
return home / "gateway.pid"
def _get_process_start_time(pid: int) -> Optional[int]:
"""Return the kernel start time for a process when available."""
stat_path = Path(f"/proc/{pid}/stat")
try:
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
return int(stat_path.read_text().split()[21])
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
return None
def _read_process_cmdline(pid: int) -> Optional[str]:
"""Return the process command line as a space-separated string."""
cmdline_path = Path(f"/proc/{pid}/cmdline")
try:
raw = cmdline_path.read_bytes()
except (FileNotFoundError, PermissionError, OSError):
return None
if not raw:
return None
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
def _looks_like_gateway_process(pid: int) -> bool:
"""Return True when the live PID still looks like the Hermes gateway."""
cmdline = _read_process_cmdline(pid)
if not cmdline:
# If we cannot inspect the process, fall back to the liveness check.
return True
patterns = (
"hermes_cli.main gateway",
"hermes gateway",
"gateway/run.py",
)
return any(pattern in cmdline for pattern in patterns)
def _build_pid_record() -> dict:
return {
"pid": os.getpid(),
"kind": _GATEWAY_KIND,
"argv": list(sys.argv),
"start_time": _get_process_start_time(os.getpid()),
}
def _read_pid_record() -> Optional[dict]:
pid_path = _get_pid_path()
if not pid_path.exists():
return None
raw = pid_path.read_text().strip()
if not raw:
return None
try:
payload = json.loads(raw)
except json.JSONDecodeError:
try:
return {"pid": int(raw)}
except ValueError:
return None
if isinstance(payload, int):
return {"pid": payload}
if isinstance(payload, dict):
return payload
return None
def write_pid_file() -> None:
"""Write the current process PID and metadata to the gateway PID file."""
pid_path = _get_pid_path()
pid_path.parent.mkdir(parents=True, exist_ok=True)
pid_path.write_text(json.dumps(_build_pid_record()))
def remove_pid_file() -> None:
"""Remove the gateway PID file if it exists."""
try:
_get_pid_path().unlink(missing_ok=True)
except Exception:
pass
def get_running_pid() -> Optional[int]:
"""Return the PID of a running gateway instance, or ``None``.
Checks the PID file and verifies the process is actually alive.
Cleans up stale PID files automatically.
"""
record = _read_pid_record()
if not record:
remove_pid_file()
return None
try:
pid = int(record["pid"])
except (KeyError, TypeError, ValueError):
remove_pid_file()
return None
try:
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
except (ProcessLookupError, PermissionError):
remove_pid_file()
return None
recorded_start = record.get("start_time")
current_start = _get_process_start_time(pid)
if recorded_start is not None and current_start is not None and current_start != recorded_start:
remove_pid_file()
return None
if not _looks_like_gateway_process(pid):
remove_pid_file()
return None
return pid
def is_gateway_running() -> bool:
"""Check if the gateway daemon is currently running."""
return get_running_pid() is not None