fix: harden gateway restart recovery

- store gateway PID metadata and validate the live process before trusting gateway.pid
- auto-refresh outdated systemd user units before start/restart so installs pick up --replace fixes
- sweep stray manual gateway processes after service stops
- add regression tests for PID validation and service drift recovery
This commit is contained in:
teknium1
2026-03-14 07:42:31 -07:00
parent 917adcbaf4
commit eb8316ea69
4 changed files with 251 additions and 15 deletions

View File

@@ -11,10 +11,14 @@ that will be useful when we add named profiles (multiple agents running
concurrently under distinct configurations).
"""
import json
import os
import sys
from pathlib import Path
from typing import Optional
_GATEWAY_KIND = "hermes-gateway"
def _get_pid_path() -> Path:
"""Return the path to the gateway PID file, respecting HERMES_HOME."""
@@ -22,11 +26,82 @@ def _get_pid_path() -> Path:
return home / "gateway.pid"
def _get_process_start_time(pid: int) -> Optional[int]:
"""Return the kernel start time for a process when available."""
stat_path = Path(f"/proc/{pid}/stat")
try:
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
return int(stat_path.read_text().split()[21])
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
return None
def _read_process_cmdline(pid: int) -> Optional[str]:
"""Return the process command line as a space-separated string."""
cmdline_path = Path(f"/proc/{pid}/cmdline")
try:
raw = cmdline_path.read_bytes()
except (FileNotFoundError, PermissionError, OSError):
return None
if not raw:
return None
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
def _looks_like_gateway_process(pid: int) -> bool:
"""Return True when the live PID still looks like the Hermes gateway."""
cmdline = _read_process_cmdline(pid)
if not cmdline:
# If we cannot inspect the process, fall back to the liveness check.
return True
patterns = (
"hermes_cli.main gateway",
"hermes gateway",
"gateway/run.py",
)
return any(pattern in cmdline for pattern in patterns)
def _build_pid_record() -> dict:
return {
"pid": os.getpid(),
"kind": _GATEWAY_KIND,
"argv": list(sys.argv),
"start_time": _get_process_start_time(os.getpid()),
}
def _read_pid_record() -> Optional[dict]:
pid_path = _get_pid_path()
if not pid_path.exists():
return None
raw = pid_path.read_text().strip()
if not raw:
return None
try:
payload = json.loads(raw)
except json.JSONDecodeError:
try:
return {"pid": int(raw)}
except ValueError:
return None
if isinstance(payload, int):
return {"pid": payload}
if isinstance(payload, dict):
return payload
return None
def write_pid_file() -> None:
"""Write the current process PID to the gateway PID file."""
"""Write the current process PID and metadata to the gateway PID file."""
pid_path = _get_pid_path()
pid_path.parent.mkdir(parents=True, exist_ok=True)
pid_path.write_text(str(os.getpid()))
pid_path.write_text(json.dumps(_build_pid_record()))
def remove_pid_file() -> None:
@@ -43,18 +118,35 @@ def get_running_pid() -> Optional[int]:
Checks the PID file and verifies the process is actually alive.
Cleans up stale PID files automatically.
"""
pid_path = _get_pid_path()
if not pid_path.exists():
return None
try:
pid = int(pid_path.read_text().strip())
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
return pid
except (ValueError, ProcessLookupError, PermissionError):
# Stale PID file — process is gone
record = _read_pid_record()
if not record:
remove_pid_file()
return None
try:
pid = int(record["pid"])
except (KeyError, TypeError, ValueError):
remove_pid_file()
return None
try:
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
except (ProcessLookupError, PermissionError):
remove_pid_file()
return None
recorded_start = record.get("start_time")
current_start = _get_process_start_time(pid)
if recorded_start is not None and current_start is not None and current_start != recorded_start:
remove_pid_file()
return None
if not _looks_like_gateway_process(pid):
remove_pid_file()
return None
return pid
def is_gateway_running() -> bool:
"""Check if the gateway daemon is currently running."""

View File

@@ -251,6 +251,34 @@ StandardError=journal
WantedBy=default.target
"""
def _normalize_service_definition(text: str) -> str:
return "\n".join(line.rstrip() for line in text.strip().splitlines())
def systemd_unit_is_current() -> bool:
unit_path = get_systemd_unit_path()
if not unit_path.exists():
return False
installed = unit_path.read_text(encoding="utf-8")
expected = generate_systemd_unit()
return _normalize_service_definition(installed) == _normalize_service_definition(expected)
def refresh_systemd_unit_if_needed() -> bool:
"""Rewrite the installed user unit when the generated definition has changed."""
unit_path = get_systemd_unit_path()
if not unit_path.exists() or systemd_unit_is_current():
return False
unit_path.write_text(generate_systemd_unit(), encoding="utf-8")
subprocess.run(["systemctl", "--user", "daemon-reload"], check=True)
print("↻ Updated gateway service definition to match the current Hermes install")
return True
def systemd_install(force: bool = False):
unit_path = get_systemd_unit_path()
@@ -289,17 +317,22 @@ def systemd_uninstall():
print("✓ Service uninstalled")
def systemd_start():
refresh_systemd_unit_if_needed()
subprocess.run(["systemctl", "--user", "start", SERVICE_NAME], check=True)
print("✓ Service started")
def systemd_stop():
subprocess.run(["systemctl", "--user", "stop", SERVICE_NAME], check=True)
print("✓ Service stopped")
def systemd_restart():
refresh_systemd_unit_if_needed()
subprocess.run(["systemctl", "--user", "restart", SERVICE_NAME], check=True)
print("✓ Service restarted")
def systemd_status(deep: bool = False):
# Check if service unit file exists
unit_path = get_systemd_unit_path()
@@ -308,6 +341,11 @@ def systemd_status(deep: bool = False):
print(" Run: hermes gateway install")
return
if not systemd_unit_is_current():
print("⚠ Installed gateway service definition is outdated")
print(" Run: hermes gateway restart # auto-refreshes the unit")
print()
# Show detailed status first
subprocess.run(
["systemctl", "--user", "status", SERVICE_NAME, "--no-pager"],
@@ -1079,7 +1117,7 @@ def gateway_command(args):
sys.exit(1)
elif subcmd == "stop":
# Try service first, fall back to killing processes directly
# Try service first, then sweep any stray/manual gateway processes.
service_available = False
if is_linux() and get_systemd_unit_path().exists():
@@ -1094,14 +1132,15 @@ def gateway_command(args):
service_available = True
except subprocess.CalledProcessError:
pass
killed = kill_gateway_processes()
if not service_available:
# Kill gateway processes directly
killed = kill_gateway_processes()
if killed:
print(f"✓ Stopped {killed} gateway process(es)")
else:
print("✗ No gateway processes found")
elif killed:
print(f"✓ Stopped {killed} additional manual gateway process(es)")
elif subcmd == "restart":
# Try service first, fall back to killing and restarting

View File

@@ -0,0 +1,27 @@
"""Tests for gateway runtime status tracking."""
import json
import os
from gateway import status
class TestGatewayPidState:
def test_write_pid_file_records_gateway_metadata(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
status.write_pid_file()
payload = json.loads((tmp_path / "gateway.pid").read_text())
assert payload["pid"] == os.getpid()
assert payload["kind"] == "hermes-gateway"
assert isinstance(payload["argv"], list)
assert payload["argv"]
def test_get_running_pid_rejects_live_non_gateway_pid(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
pid_path = tmp_path / "gateway.pid"
pid_path.write_text(str(os.getpid()))
assert status.get_running_pid() is None
assert not pid_path.exists()

View File

@@ -0,0 +1,78 @@
"""Tests for gateway service management helpers."""
from types import SimpleNamespace
import hermes_cli.gateway as gateway_cli
class TestSystemdServiceRefresh:
def test_systemd_start_refreshes_outdated_unit(self, tmp_path, monkeypatch):
unit_path = tmp_path / "hermes-gateway.service"
unit_path.write_text("old unit\n", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path)
monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda: "new unit\n")
calls = []
def fake_run(cmd, check=True, **kwargs):
calls.append(cmd)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
gateway_cli.systemd_start()
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
assert calls[:2] == [
["systemctl", "--user", "daemon-reload"],
["systemctl", "--user", "start", gateway_cli.SERVICE_NAME],
]
def test_systemd_restart_refreshes_outdated_unit(self, tmp_path, monkeypatch):
unit_path = tmp_path / "hermes-gateway.service"
unit_path.write_text("old unit\n", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path)
monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda: "new unit\n")
calls = []
def fake_run(cmd, check=True, **kwargs):
calls.append(cmd)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
gateway_cli.systemd_restart()
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
assert calls[:2] == [
["systemctl", "--user", "daemon-reload"],
["systemctl", "--user", "restart", gateway_cli.SERVICE_NAME],
]
class TestGatewayStopCleanup:
def test_stop_sweeps_manual_gateway_processes_after_service_stop(self, tmp_path, monkeypatch):
unit_path = tmp_path / "hermes-gateway.service"
unit_path.write_text("unit\n", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path)
service_calls = []
kill_calls = []
monkeypatch.setattr(gateway_cli, "systemd_stop", lambda: service_calls.append("stop"))
monkeypatch.setattr(
gateway_cli,
"kill_gateway_processes",
lambda force=False: kill_calls.append(force) or 2,
)
gateway_cli.gateway_command(SimpleNamespace(gateway_command="stop"))
assert service_calls == ["stop"]
assert kill_calls == [False]