fix: harden gateway restart recovery
- store gateway PID metadata and validate the live process before trusting gateway.pid - auto-refresh outdated systemd user units before start/restart so installs pick up --replace fixes - sweep stray manual gateway processes after service stops - add regression tests for PID validation and service drift recovery
This commit is contained in:
@@ -11,10 +11,14 @@ that will be useful when we add named profiles (multiple agents running
|
||||
concurrently under distinct configurations).
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
_GATEWAY_KIND = "hermes-gateway"
|
||||
|
||||
|
||||
def _get_pid_path() -> Path:
|
||||
"""Return the path to the gateway PID file, respecting HERMES_HOME."""
|
||||
@@ -22,11 +26,82 @@ def _get_pid_path() -> Path:
|
||||
return home / "gateway.pid"
|
||||
|
||||
|
||||
def _get_process_start_time(pid: int) -> Optional[int]:
|
||||
"""Return the kernel start time for a process when available."""
|
||||
stat_path = Path(f"/proc/{pid}/stat")
|
||||
try:
|
||||
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
|
||||
return int(stat_path.read_text().split()[21])
|
||||
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
|
||||
return None
|
||||
|
||||
|
||||
def _read_process_cmdline(pid: int) -> Optional[str]:
|
||||
"""Return the process command line as a space-separated string."""
|
||||
cmdline_path = Path(f"/proc/{pid}/cmdline")
|
||||
try:
|
||||
raw = cmdline_path.read_bytes()
|
||||
except (FileNotFoundError, PermissionError, OSError):
|
||||
return None
|
||||
|
||||
if not raw:
|
||||
return None
|
||||
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
|
||||
|
||||
|
||||
def _looks_like_gateway_process(pid: int) -> bool:
|
||||
"""Return True when the live PID still looks like the Hermes gateway."""
|
||||
cmdline = _read_process_cmdline(pid)
|
||||
if not cmdline:
|
||||
# If we cannot inspect the process, fall back to the liveness check.
|
||||
return True
|
||||
|
||||
patterns = (
|
||||
"hermes_cli.main gateway",
|
||||
"hermes gateway",
|
||||
"gateway/run.py",
|
||||
)
|
||||
return any(pattern in cmdline for pattern in patterns)
|
||||
|
||||
|
||||
def _build_pid_record() -> dict:
|
||||
return {
|
||||
"pid": os.getpid(),
|
||||
"kind": _GATEWAY_KIND,
|
||||
"argv": list(sys.argv),
|
||||
"start_time": _get_process_start_time(os.getpid()),
|
||||
}
|
||||
|
||||
|
||||
def _read_pid_record() -> Optional[dict]:
|
||||
pid_path = _get_pid_path()
|
||||
if not pid_path.exists():
|
||||
return None
|
||||
|
||||
raw = pid_path.read_text().strip()
|
||||
if not raw:
|
||||
return None
|
||||
|
||||
try:
|
||||
payload = json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
try:
|
||||
return {"pid": int(raw)}
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
if isinstance(payload, int):
|
||||
return {"pid": payload}
|
||||
if isinstance(payload, dict):
|
||||
return payload
|
||||
return None
|
||||
|
||||
|
||||
def write_pid_file() -> None:
|
||||
"""Write the current process PID to the gateway PID file."""
|
||||
"""Write the current process PID and metadata to the gateway PID file."""
|
||||
pid_path = _get_pid_path()
|
||||
pid_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pid_path.write_text(str(os.getpid()))
|
||||
pid_path.write_text(json.dumps(_build_pid_record()))
|
||||
|
||||
|
||||
def remove_pid_file() -> None:
|
||||
@@ -43,18 +118,35 @@ def get_running_pid() -> Optional[int]:
|
||||
Checks the PID file and verifies the process is actually alive.
|
||||
Cleans up stale PID files automatically.
|
||||
"""
|
||||
pid_path = _get_pid_path()
|
||||
if not pid_path.exists():
|
||||
return None
|
||||
try:
|
||||
pid = int(pid_path.read_text().strip())
|
||||
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
||||
return pid
|
||||
except (ValueError, ProcessLookupError, PermissionError):
|
||||
# Stale PID file — process is gone
|
||||
record = _read_pid_record()
|
||||
if not record:
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
try:
|
||||
pid = int(record["pid"])
|
||||
except (KeyError, TypeError, ValueError):
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
try:
|
||||
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
||||
except (ProcessLookupError, PermissionError):
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
recorded_start = record.get("start_time")
|
||||
current_start = _get_process_start_time(pid)
|
||||
if recorded_start is not None and current_start is not None and current_start != recorded_start:
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
if not _looks_like_gateway_process(pid):
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
return pid
|
||||
|
||||
|
||||
def is_gateway_running() -> bool:
|
||||
"""Check if the gateway daemon is currently running."""
|
||||
|
||||
@@ -251,6 +251,34 @@ StandardError=journal
|
||||
WantedBy=default.target
|
||||
"""
|
||||
|
||||
|
||||
def _normalize_service_definition(text: str) -> str:
|
||||
return "\n".join(line.rstrip() for line in text.strip().splitlines())
|
||||
|
||||
|
||||
def systemd_unit_is_current() -> bool:
|
||||
unit_path = get_systemd_unit_path()
|
||||
if not unit_path.exists():
|
||||
return False
|
||||
|
||||
installed = unit_path.read_text(encoding="utf-8")
|
||||
expected = generate_systemd_unit()
|
||||
return _normalize_service_definition(installed) == _normalize_service_definition(expected)
|
||||
|
||||
|
||||
|
||||
def refresh_systemd_unit_if_needed() -> bool:
|
||||
"""Rewrite the installed user unit when the generated definition has changed."""
|
||||
unit_path = get_systemd_unit_path()
|
||||
if not unit_path.exists() or systemd_unit_is_current():
|
||||
return False
|
||||
|
||||
unit_path.write_text(generate_systemd_unit(), encoding="utf-8")
|
||||
subprocess.run(["systemctl", "--user", "daemon-reload"], check=True)
|
||||
print("↻ Updated gateway service definition to match the current Hermes install")
|
||||
return True
|
||||
|
||||
|
||||
def systemd_install(force: bool = False):
|
||||
unit_path = get_systemd_unit_path()
|
||||
|
||||
@@ -289,17 +317,22 @@ def systemd_uninstall():
|
||||
print("✓ Service uninstalled")
|
||||
|
||||
def systemd_start():
|
||||
refresh_systemd_unit_if_needed()
|
||||
subprocess.run(["systemctl", "--user", "start", SERVICE_NAME], check=True)
|
||||
print("✓ Service started")
|
||||
|
||||
|
||||
def systemd_stop():
|
||||
subprocess.run(["systemctl", "--user", "stop", SERVICE_NAME], check=True)
|
||||
print("✓ Service stopped")
|
||||
|
||||
|
||||
def systemd_restart():
|
||||
refresh_systemd_unit_if_needed()
|
||||
subprocess.run(["systemctl", "--user", "restart", SERVICE_NAME], check=True)
|
||||
print("✓ Service restarted")
|
||||
|
||||
|
||||
def systemd_status(deep: bool = False):
|
||||
# Check if service unit file exists
|
||||
unit_path = get_systemd_unit_path()
|
||||
@@ -308,6 +341,11 @@ def systemd_status(deep: bool = False):
|
||||
print(" Run: hermes gateway install")
|
||||
return
|
||||
|
||||
if not systemd_unit_is_current():
|
||||
print("⚠ Installed gateway service definition is outdated")
|
||||
print(" Run: hermes gateway restart # auto-refreshes the unit")
|
||||
print()
|
||||
|
||||
# Show detailed status first
|
||||
subprocess.run(
|
||||
["systemctl", "--user", "status", SERVICE_NAME, "--no-pager"],
|
||||
@@ -1079,7 +1117,7 @@ def gateway_command(args):
|
||||
sys.exit(1)
|
||||
|
||||
elif subcmd == "stop":
|
||||
# Try service first, fall back to killing processes directly
|
||||
# Try service first, then sweep any stray/manual gateway processes.
|
||||
service_available = False
|
||||
|
||||
if is_linux() and get_systemd_unit_path().exists():
|
||||
@@ -1094,14 +1132,15 @@ def gateway_command(args):
|
||||
service_available = True
|
||||
except subprocess.CalledProcessError:
|
||||
pass
|
||||
|
||||
|
||||
killed = kill_gateway_processes()
|
||||
if not service_available:
|
||||
# Kill gateway processes directly
|
||||
killed = kill_gateway_processes()
|
||||
if killed:
|
||||
print(f"✓ Stopped {killed} gateway process(es)")
|
||||
else:
|
||||
print("✗ No gateway processes found")
|
||||
elif killed:
|
||||
print(f"✓ Stopped {killed} additional manual gateway process(es)")
|
||||
|
||||
elif subcmd == "restart":
|
||||
# Try service first, fall back to killing and restarting
|
||||
|
||||
27
tests/gateway/test_status.py
Normal file
27
tests/gateway/test_status.py
Normal file
@@ -0,0 +1,27 @@
|
||||
"""Tests for gateway runtime status tracking."""
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
from gateway import status
|
||||
|
||||
|
||||
class TestGatewayPidState:
|
||||
def test_write_pid_file_records_gateway_metadata(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
status.write_pid_file()
|
||||
|
||||
payload = json.loads((tmp_path / "gateway.pid").read_text())
|
||||
assert payload["pid"] == os.getpid()
|
||||
assert payload["kind"] == "hermes-gateway"
|
||||
assert isinstance(payload["argv"], list)
|
||||
assert payload["argv"]
|
||||
|
||||
def test_get_running_pid_rejects_live_non_gateway_pid(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
pid_path = tmp_path / "gateway.pid"
|
||||
pid_path.write_text(str(os.getpid()))
|
||||
|
||||
assert status.get_running_pid() is None
|
||||
assert not pid_path.exists()
|
||||
78
tests/hermes_cli/test_gateway_service.py
Normal file
78
tests/hermes_cli/test_gateway_service.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Tests for gateway service management helpers."""
|
||||
|
||||
from types import SimpleNamespace
|
||||
|
||||
import hermes_cli.gateway as gateway_cli
|
||||
|
||||
|
||||
class TestSystemdServiceRefresh:
|
||||
def test_systemd_start_refreshes_outdated_unit(self, tmp_path, monkeypatch):
|
||||
unit_path = tmp_path / "hermes-gateway.service"
|
||||
unit_path.write_text("old unit\n", encoding="utf-8")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path)
|
||||
monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda: "new unit\n")
|
||||
|
||||
calls = []
|
||||
|
||||
def fake_run(cmd, check=True, **kwargs):
|
||||
calls.append(cmd)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
gateway_cli.systemd_start()
|
||||
|
||||
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
|
||||
assert calls[:2] == [
|
||||
["systemctl", "--user", "daemon-reload"],
|
||||
["systemctl", "--user", "start", gateway_cli.SERVICE_NAME],
|
||||
]
|
||||
|
||||
def test_systemd_restart_refreshes_outdated_unit(self, tmp_path, monkeypatch):
|
||||
unit_path = tmp_path / "hermes-gateway.service"
|
||||
unit_path.write_text("old unit\n", encoding="utf-8")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path)
|
||||
monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda: "new unit\n")
|
||||
|
||||
calls = []
|
||||
|
||||
def fake_run(cmd, check=True, **kwargs):
|
||||
calls.append(cmd)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
gateway_cli.systemd_restart()
|
||||
|
||||
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
|
||||
assert calls[:2] == [
|
||||
["systemctl", "--user", "daemon-reload"],
|
||||
["systemctl", "--user", "restart", gateway_cli.SERVICE_NAME],
|
||||
]
|
||||
|
||||
|
||||
class TestGatewayStopCleanup:
|
||||
def test_stop_sweeps_manual_gateway_processes_after_service_stop(self, tmp_path, monkeypatch):
|
||||
unit_path = tmp_path / "hermes-gateway.service"
|
||||
unit_path.write_text("unit\n", encoding="utf-8")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path)
|
||||
|
||||
service_calls = []
|
||||
kill_calls = []
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "systemd_stop", lambda: service_calls.append("stop"))
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"kill_gateway_processes",
|
||||
lambda force=False: kill_calls.append(force) or 2,
|
||||
)
|
||||
|
||||
gateway_cli.gateway_command(SimpleNamespace(gateway_command="stop"))
|
||||
|
||||
assert service_calls == ["stop"]
|
||||
assert kill_calls == [False]
|
||||
Reference in New Issue
Block a user