From eb8316ea69896aebe917b7b4db4b795929cfc0cb Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 07:42:31 -0700 Subject: [PATCH] fix: harden gateway restart recovery - store gateway PID metadata and validate the live process before trusting gateway.pid - auto-refresh outdated systemd user units before start/restart so installs pick up --replace fixes - sweep stray manual gateway processes after service stops - add regression tests for PID validation and service drift recovery --- gateway/status.py | 114 ++++++++++++++++++++--- hermes_cli/gateway.py | 47 +++++++++- tests/gateway/test_status.py | 27 ++++++ tests/hermes_cli/test_gateway_service.py | 78 ++++++++++++++++ 4 files changed, 251 insertions(+), 15 deletions(-) create mode 100644 tests/gateway/test_status.py create mode 100644 tests/hermes_cli/test_gateway_service.py diff --git a/gateway/status.py b/gateway/status.py index 78d71947f..db72f1fed 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -11,10 +11,14 @@ that will be useful when we add named profiles (multiple agents running concurrently under distinct configurations). """ +import json import os +import sys from pathlib import Path from typing import Optional +_GATEWAY_KIND = "hermes-gateway" + def _get_pid_path() -> Path: """Return the path to the gateway PID file, respecting HERMES_HOME.""" @@ -22,11 +26,82 @@ def _get_pid_path() -> Path: return home / "gateway.pid" +def _get_process_start_time(pid: int) -> Optional[int]: + """Return the kernel start time for a process when available.""" + stat_path = Path(f"/proc/{pid}/stat") + try: + # Field 22 in /proc//stat is process start time (clock ticks). + return int(stat_path.read_text().split()[21]) + except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError): + return None + + +def _read_process_cmdline(pid: int) -> Optional[str]: + """Return the process command line as a space-separated string.""" + cmdline_path = Path(f"/proc/{pid}/cmdline") + try: + raw = cmdline_path.read_bytes() + except (FileNotFoundError, PermissionError, OSError): + return None + + if not raw: + return None + return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip() + + +def _looks_like_gateway_process(pid: int) -> bool: + """Return True when the live PID still looks like the Hermes gateway.""" + cmdline = _read_process_cmdline(pid) + if not cmdline: + # If we cannot inspect the process, fall back to the liveness check. + return True + + patterns = ( + "hermes_cli.main gateway", + "hermes gateway", + "gateway/run.py", + ) + return any(pattern in cmdline for pattern in patterns) + + +def _build_pid_record() -> dict: + return { + "pid": os.getpid(), + "kind": _GATEWAY_KIND, + "argv": list(sys.argv), + "start_time": _get_process_start_time(os.getpid()), + } + + +def _read_pid_record() -> Optional[dict]: + pid_path = _get_pid_path() + if not pid_path.exists(): + return None + + raw = pid_path.read_text().strip() + if not raw: + return None + + try: + payload = json.loads(raw) + except json.JSONDecodeError: + try: + return {"pid": int(raw)} + except ValueError: + return None + + if isinstance(payload, int): + return {"pid": payload} + if isinstance(payload, dict): + return payload + return None + + def write_pid_file() -> None: - """Write the current process PID to the gateway PID file.""" + """Write the current process PID and metadata to the gateway PID file.""" pid_path = _get_pid_path() pid_path.parent.mkdir(parents=True, exist_ok=True) - pid_path.write_text(str(os.getpid())) + pid_path.write_text(json.dumps(_build_pid_record())) def remove_pid_file() -> None: @@ -43,18 +118,35 @@ def get_running_pid() -> Optional[int]: Checks the PID file and verifies the process is actually alive. Cleans up stale PID files automatically. """ - pid_path = _get_pid_path() - if not pid_path.exists(): - return None - try: - pid = int(pid_path.read_text().strip()) - os.kill(pid, 0) # signal 0 = existence check, no actual signal sent - return pid - except (ValueError, ProcessLookupError, PermissionError): - # Stale PID file — process is gone + record = _read_pid_record() + if not record: remove_pid_file() return None + try: + pid = int(record["pid"]) + except (KeyError, TypeError, ValueError): + remove_pid_file() + return None + + try: + os.kill(pid, 0) # signal 0 = existence check, no actual signal sent + except (ProcessLookupError, PermissionError): + remove_pid_file() + return None + + recorded_start = record.get("start_time") + current_start = _get_process_start_time(pid) + if recorded_start is not None and current_start is not None and current_start != recorded_start: + remove_pid_file() + return None + + if not _looks_like_gateway_process(pid): + remove_pid_file() + return None + + return pid + def is_gateway_running() -> bool: """Check if the gateway daemon is currently running.""" diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 0529dbb85..4d3ed8845 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -251,6 +251,34 @@ StandardError=journal WantedBy=default.target """ + +def _normalize_service_definition(text: str) -> str: + return "\n".join(line.rstrip() for line in text.strip().splitlines()) + + +def systemd_unit_is_current() -> bool: + unit_path = get_systemd_unit_path() + if not unit_path.exists(): + return False + + installed = unit_path.read_text(encoding="utf-8") + expected = generate_systemd_unit() + return _normalize_service_definition(installed) == _normalize_service_definition(expected) + + + +def refresh_systemd_unit_if_needed() -> bool: + """Rewrite the installed user unit when the generated definition has changed.""" + unit_path = get_systemd_unit_path() + if not unit_path.exists() or systemd_unit_is_current(): + return False + + unit_path.write_text(generate_systemd_unit(), encoding="utf-8") + subprocess.run(["systemctl", "--user", "daemon-reload"], check=True) + print("↻ Updated gateway service definition to match the current Hermes install") + return True + + def systemd_install(force: bool = False): unit_path = get_systemd_unit_path() @@ -289,17 +317,22 @@ def systemd_uninstall(): print("✓ Service uninstalled") def systemd_start(): + refresh_systemd_unit_if_needed() subprocess.run(["systemctl", "--user", "start", SERVICE_NAME], check=True) print("✓ Service started") + def systemd_stop(): subprocess.run(["systemctl", "--user", "stop", SERVICE_NAME], check=True) print("✓ Service stopped") + def systemd_restart(): + refresh_systemd_unit_if_needed() subprocess.run(["systemctl", "--user", "restart", SERVICE_NAME], check=True) print("✓ Service restarted") + def systemd_status(deep: bool = False): # Check if service unit file exists unit_path = get_systemd_unit_path() @@ -308,6 +341,11 @@ def systemd_status(deep: bool = False): print(" Run: hermes gateway install") return + if not systemd_unit_is_current(): + print("⚠ Installed gateway service definition is outdated") + print(" Run: hermes gateway restart # auto-refreshes the unit") + print() + # Show detailed status first subprocess.run( ["systemctl", "--user", "status", SERVICE_NAME, "--no-pager"], @@ -1079,7 +1117,7 @@ def gateway_command(args): sys.exit(1) elif subcmd == "stop": - # Try service first, fall back to killing processes directly + # Try service first, then sweep any stray/manual gateway processes. service_available = False if is_linux() and get_systemd_unit_path().exists(): @@ -1094,14 +1132,15 @@ def gateway_command(args): service_available = True except subprocess.CalledProcessError: pass - + + killed = kill_gateway_processes() if not service_available: - # Kill gateway processes directly - killed = kill_gateway_processes() if killed: print(f"✓ Stopped {killed} gateway process(es)") else: print("✗ No gateway processes found") + elif killed: + print(f"✓ Stopped {killed} additional manual gateway process(es)") elif subcmd == "restart": # Try service first, fall back to killing and restarting diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py new file mode 100644 index 000000000..025708a53 --- /dev/null +++ b/tests/gateway/test_status.py @@ -0,0 +1,27 @@ +"""Tests for gateway runtime status tracking.""" + +import json +import os + +from gateway import status + + +class TestGatewayPidState: + def test_write_pid_file_records_gateway_metadata(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + + status.write_pid_file() + + payload = json.loads((tmp_path / "gateway.pid").read_text()) + assert payload["pid"] == os.getpid() + assert payload["kind"] == "hermes-gateway" + assert isinstance(payload["argv"], list) + assert payload["argv"] + + def test_get_running_pid_rejects_live_non_gateway_pid(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + pid_path = tmp_path / "gateway.pid" + pid_path.write_text(str(os.getpid())) + + assert status.get_running_pid() is None + assert not pid_path.exists() diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py new file mode 100644 index 000000000..4f8eb39a2 --- /dev/null +++ b/tests/hermes_cli/test_gateway_service.py @@ -0,0 +1,78 @@ +"""Tests for gateway service management helpers.""" + +from types import SimpleNamespace + +import hermes_cli.gateway as gateway_cli + + +class TestSystemdServiceRefresh: + def test_systemd_start_refreshes_outdated_unit(self, tmp_path, monkeypatch): + unit_path = tmp_path / "hermes-gateway.service" + unit_path.write_text("old unit\n", encoding="utf-8") + + monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path) + monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda: "new unit\n") + + calls = [] + + def fake_run(cmd, check=True, **kwargs): + calls.append(cmd) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + gateway_cli.systemd_start() + + assert unit_path.read_text(encoding="utf-8") == "new unit\n" + assert calls[:2] == [ + ["systemctl", "--user", "daemon-reload"], + ["systemctl", "--user", "start", gateway_cli.SERVICE_NAME], + ] + + def test_systemd_restart_refreshes_outdated_unit(self, tmp_path, monkeypatch): + unit_path = tmp_path / "hermes-gateway.service" + unit_path.write_text("old unit\n", encoding="utf-8") + + monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path) + monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda: "new unit\n") + + calls = [] + + def fake_run(cmd, check=True, **kwargs): + calls.append(cmd) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + gateway_cli.systemd_restart() + + assert unit_path.read_text(encoding="utf-8") == "new unit\n" + assert calls[:2] == [ + ["systemctl", "--user", "daemon-reload"], + ["systemctl", "--user", "restart", gateway_cli.SERVICE_NAME], + ] + + +class TestGatewayStopCleanup: + def test_stop_sweeps_manual_gateway_processes_after_service_stop(self, tmp_path, monkeypatch): + unit_path = tmp_path / "hermes-gateway.service" + unit_path.write_text("unit\n", encoding="utf-8") + + monkeypatch.setattr(gateway_cli, "is_linux", lambda: True) + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path) + + service_calls = [] + kill_calls = [] + + monkeypatch.setattr(gateway_cli, "systemd_stop", lambda: service_calls.append("stop")) + monkeypatch.setattr( + gateway_cli, + "kill_gateway_processes", + lambda force=False: kill_calls.append(force) or 2, + ) + + gateway_cli.gateway_command(SimpleNamespace(gateway_command="stop")) + + assert service_calls == ["stop"] + assert kill_calls == [False]