- store gateway PID metadata and validate the live process before trusting gateway.pid - auto-refresh outdated systemd user units before start/restart so installs pick up --replace fixes - sweep stray manual gateway processes after service stops - add regression tests for PID validation and service drift recovery
154 lines
4.3 KiB
Python
154 lines
4.3 KiB
Python
"""
|
|
Gateway runtime status helpers.
|
|
|
|
Provides PID-file based detection of whether the gateway daemon is running,
|
|
used by send_message's check_fn to gate availability in the CLI.
|
|
|
|
The PID file lives at ``{HERMES_HOME}/gateway.pid``. HERMES_HOME defaults to
|
|
``~/.hermes`` but can be overridden via the environment variable. This means
|
|
separate HERMES_HOME directories naturally get separate PID files — a property
|
|
that will be useful when we add named profiles (multiple agents running
|
|
concurrently under distinct configurations).
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Marker stored under the "kind" key of the PID record built by
# _build_pid_record().
_GATEWAY_KIND = "hermes-gateway"
|
|
|
|
|
|
def _get_pid_path() -> Path:
    """Locate the gateway PID file.

    The file is ``gateway.pid`` inside the directory named by the
    ``HERMES_HOME`` environment variable, or inside ``~/.hermes`` when that
    variable is not set.
    """
    default_home = Path.home() / ".hermes"
    hermes_home = Path(os.getenv("HERMES_HOME", default_home))
    return hermes_home / "gateway.pid"
|
|
|
|
|
|
def _get_process_start_time(pid: int) -> Optional[int]:
    """Return the kernel start time of *pid* in clock ticks, or ``None``.

    The value is field 22 (``starttime``) of ``/proc/<pid>/stat``.

    Args:
        pid: Process ID to inspect.

    Returns:
        The start time as an integer, or ``None`` when the stat file cannot
        be read (no ``/proc``, process gone, no permission) or cannot be
        parsed.
    """
    stat_path = Path(f"/proc/{pid}/stat")
    try:
        # errors="replace": the process name may contain bytes that are not
        # valid in the default text encoding; they are irrelevant here.
        stat_text = stat_path.read_text(errors="replace")
    except OSError:
        return None

    # Field 2 (comm) is wrapped in parentheses and may itself contain spaces
    # or ")" characters, so splitting the whole line on whitespace can shift
    # every later field. Parse only what follows the *last* ")" instead.
    _, closing_paren, after_comm = stat_text.rpartition(")")
    if not closing_paren:
        return None

    try:
        # after_comm starts at field 3 (state), so field 22 is at index 19.
        return int(after_comm.split()[19])
    except (IndexError, ValueError):
        return None
|
|
|
|
|
|
def _read_process_cmdline(pid: int) -> Optional[str]:
    """Fetch the command line of *pid* from ``/proc/<pid>/cmdline``.

    The NUL bytes in the file are turned into spaces and the result is
    stripped. Returns ``None`` when the file cannot be read or is empty.
    """
    try:
        nul_separated = Path(f"/proc/{pid}/cmdline").read_bytes()
    except OSError:
        # Missing /proc entry, lack of permission, or any other I/O failure.
        return None

    if len(nul_separated) == 0:
        return None

    space_separated = nul_separated.replace(b"\x00", b" ")
    decoded = space_separated.decode("utf-8", errors="ignore")
    return decoded.strip()
|
|
|
|
|
|
def _looks_like_gateway_process(pid: int) -> bool:
    """Decide whether *pid*'s command line resembles a Hermes gateway launch.

    Returns ``True`` when the command line contains one of the known gateway
    invocation fragments, and also when the command line cannot be read at
    all (in that case the caller's liveness check is the only evidence).
    """
    command = _read_process_cmdline(pid)
    if command:
        for marker in ("hermes_cli.main gateway", "hermes gateway", "gateway/run.py"):
            if marker in command:
                return True
        return False

    # Nothing to inspect: do not overrule the liveness check.
    return True
|
|
|
|
|
|
def _build_pid_record() -> dict:
    """Assemble the metadata dict describing the current process.

    Keys: ``pid`` (this process's PID), ``kind`` (the gateway marker
    constant), ``argv`` (a copy of ``sys.argv``) and ``start_time`` (the
    process start time, or ``None`` when it is unavailable).
    """
    own_pid = os.getpid()
    record = {"pid": own_pid, "kind": _GATEWAY_KIND}
    record["argv"] = list(sys.argv)
    record["start_time"] = _get_process_start_time(own_pid)
    return record
|
|
|
|
|
|
def _read_pid_record() -> Optional[dict]:
    """Load the PID record from the gateway PID file.

    Two on-disk formats are accepted: a JSON object (returned as-is) and a
    bare integer PID (returned as ``{"pid": <int>}``).

    Returns:
        The record dict, or ``None`` when the file is missing, unreadable,
        empty, or holds anything other than the two formats above.
    """
    pid_path = _get_pid_path()
    try:
        # Read directly instead of checking exists() first: the file can
        # disappear between the check and the read.
        raw = pid_path.read_text().strip()
    except (OSError, UnicodeDecodeError):
        return None
    if not raw:
        return None

    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON; it may still be a plain integer that JSON rejects
        # (for example one with leading zeros).
        try:
            return {"pid": int(raw)}
        except ValueError:
            return None

    # bool is a subclass of int; JSON true/false is not a PID.
    if isinstance(payload, bool):
        return None
    if isinstance(payload, int):
        return {"pid": payload}
    if isinstance(payload, dict):
        return payload
    return None
|
|
|
|
|
|
def write_pid_file() -> None:
    """Write the current process PID and metadata to the gateway PID file.

    The record is written to a temporary file in the same directory and then
    moved over the final path with ``os.replace``. A concurrent reader thus
    sees either the previous file or the complete new one, never a truncated
    file (which ``get_running_pid`` would treat as stale and delete).

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    pid_path = _get_pid_path()
    pid_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(_build_pid_record())

    # Include our PID in the temp name so two writers never share a temp file.
    tmp_path = pid_path.with_name(f"{pid_path.name}.{os.getpid()}.tmp")
    try:
        tmp_path.write_text(payload)
        os.replace(tmp_path, pid_path)
    except OSError:
        # Do not leave a stray temp file behind; report the original error.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
        raise
|
|
|
|
|
|
def remove_pid_file() -> None:
    """Delete the gateway PID file, ignoring any failure.

    A missing file is not an error, and any exception raised while locating
    or unlinking the file is swallowed.
    """
    try:
        pid_file = _get_pid_path()
        pid_file.unlink(missing_ok=True)
    except Exception:
        # Best-effort cleanup: errors are deliberately swallowed.
        return
|
|
|
|
|
|
def get_running_pid() -> Optional[int]:
    """Return the PID of a running gateway instance, or ``None``.

    The PID file is trusted only if all of the following hold:

    * it contains a record with a positive integer ``pid``;
    * signal 0 can be delivered to that PID (a process we are not permitted
      to signal is treated as not being our gateway);
    * when both the recorded and the current start time are known, they are
      equal (otherwise the PID has been reused by another process);
    * the live process still looks like a gateway process.

    If any check fails the PID file is removed and ``None`` is returned.
    """
    record = _read_pid_record()
    if not record:
        remove_pid_file()
        return None

    try:
        pid = int(record["pid"])
    except (KeyError, TypeError, ValueError, OverflowError):
        # OverflowError: a JSON "Infinity" value cannot be converted to int.
        remove_pid_file()
        return None

    # os.kill() gives non-positive PIDs a special meaning (0 = the caller's
    # process group, -1 = every process the caller may signal), so the
    # existence probe below would "succeed" for them. They are never valid.
    if pid <= 0:
        remove_pid_file()
        return None

    try:
        os.kill(pid, 0)  # signal 0 = existence check, no actual signal sent
    except (ProcessLookupError, PermissionError, OverflowError):
        # OverflowError: the recorded number does not fit the platform's pid type.
        remove_pid_file()
        return None

    recorded_start = record.get("start_time")
    current_start = _get_process_start_time(pid)
    if (
        recorded_start is not None
        and current_start is not None
        and current_start != recorded_start
    ):
        # Same PID, different start time: the PID was reused.
        remove_pid_file()
        return None

    if not _looks_like_gateway_process(pid):
        remove_pid_file()
        return None

    return pid
|
|
|
|
|
|
def is_gateway_running() -> bool:
    """Report whether ``get_running_pid`` finds a running gateway."""
    running_pid = get_running_pid()
    return running_pid is not None
|