From ace2cc62575b39c58abffd2bf1d46e0e86114bd1 Mon Sep 17 00:00:00 2001 From: Test Date: Wed, 18 Mar 2026 02:54:18 -0700 Subject: [PATCH] fix(gateway): PID-based wait with force-kill for gateway restart Add _wait_for_gateway_exit() that polls get_running_pid() to confirm the old gateway process has actually exited before starting a new one. If the process doesn't exit within 5s, sends SIGKILL to the specific PID. Uses the saved PID from gateway.pid (not launchd labels) so it works correctly with multiple gateway instances under separate HERMES_HOME directories. Applied to both launchd_restart() and the manual restart path (replaces the blind time.sleep(2)). Inspired by PR #1881 by @AzothZephyr (race condition diagnosis). Adds 4 tests. --- hermes_cli/gateway.py | 48 ++++++++++++++++-- tests/hermes_cli/test_gateway.py | 83 ++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 4 deletions(-) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 1c44e3113..a7876bc40 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -849,6 +849,46 @@ def launchd_stop(): subprocess.run(["launchctl", "stop", "ai.hermes.gateway"], check=True) print("✓ Service stopped") +def _wait_for_gateway_exit(timeout: float = 10.0, force_after: float = 5.0): + """Wait for the gateway process (by saved PID) to exit. + + Uses the PID from the gateway.pid file — not launchd labels — so this + works correctly when multiple gateway instances run under separate + HERMES_HOME directories. + + Args: + timeout: Total seconds to wait before giving up. + force_after: Seconds of graceful waiting before sending SIGKILL. + """ + import time + from gateway.status import get_running_pid + + deadline = time.monotonic() + timeout + force_deadline = time.monotonic() + force_after + force_sent = False + + while time.monotonic() < deadline: + pid = get_running_pid() + if pid is None: + return # Process exited cleanly. + + if not force_sent and time.monotonic() >= force_deadline: + # Grace period expired — force-kill the specific PID. + try: + os.kill(pid, signal.SIGKILL) + print(f"⚠ Gateway PID {pid} did not exit gracefully; sent SIGKILL") + except (ProcessLookupError, PermissionError): + return # Already gone or we can't touch it. + force_sent = True + + time.sleep(0.3) + + # Timed out even after SIGKILL. + remaining_pid = get_running_pid() + if remaining_pid is not None: + print(f"⚠ Gateway PID {remaining_pid} still running after {timeout}s — restart may fail") + + def launchd_restart(): try: launchd_stop() @@ -856,6 +896,7 @@ def launchd_restart(): if e.returncode != 3: raise print("↻ launchd job was unloaded; skipping stop") + _wait_for_gateway_exit() launchd_start() def launchd_status(deep: bool = False): @@ -1753,10 +1794,9 @@ def gateway_command(args): killed = kill_gateway_processes() if killed: print(f"✓ Stopped {killed} gateway process(es)") - - import time - time.sleep(2) - + + _wait_for_gateway_exit(timeout=10.0, force_after=5.0) + # Start fresh print("Starting gateway...") run_gateway(verbose=False) diff --git a/tests/hermes_cli/test_gateway.py b/tests/hermes_cli/test_gateway.py index 52d43fd08..b92f385e2 100644 --- a/tests/hermes_cli/test_gateway.py +++ b/tests/hermes_cli/test_gateway.py @@ -1,6 +1,8 @@ """Tests for hermes_cli.gateway.""" +import signal from types import SimpleNamespace +from unittest.mock import patch, call import hermes_cli.gateway as gateway @@ -169,3 +171,84 @@ def test_install_linux_gateway_from_setup_system_choice_as_root_installs(monkeyp assert (scope, did_install) == ("system", True) assert calls == [(True, True, "alice")] + + +# --------------------------------------------------------------------------- +# _wait_for_gateway_exit +# --------------------------------------------------------------------------- + + +class TestWaitForGatewayExit: + """PID-based wait with force-kill on timeout.""" + + def test_returns_immediately_when_no_pid(self, monkeypatch): + """If get_running_pid returns None, exit instantly.""" + monkeypatch.setattr("gateway.status.get_running_pid", lambda: None) + # Should return without sleeping at all. + gateway._wait_for_gateway_exit(timeout=1.0, force_after=0.5) + + def test_returns_when_process_exits_gracefully(self, monkeypatch): + """Process exits after a couple of polls — no SIGKILL needed.""" + poll_count = 0 + + def mock_get_running_pid(): + nonlocal poll_count + poll_count += 1 + return 12345 if poll_count <= 2 else None + + monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid) + monkeypatch.setattr("time.sleep", lambda _: None) + + gateway._wait_for_gateway_exit(timeout=10.0, force_after=999.0) + # Should have polled until None was returned. + assert poll_count == 3 + + def test_force_kills_after_grace_period(self, monkeypatch): + """When the process doesn't exit, SIGKILL the saved PID.""" + import time as _time + + # Simulate monotonic time advancing past force_after + call_num = 0 + def fake_monotonic(): + nonlocal call_num + call_num += 1 + # First two calls: initial deadline + force_deadline setup (time 0) + # Then each loop iteration advances time + return call_num * 2.0 # 2, 4, 6, 8, ... + + kills = [] + def mock_kill(pid, sig): + kills.append((pid, sig)) + + # get_running_pid returns the PID until kill is sent, then None + def mock_get_running_pid(): + return None if kills else 42 + + monkeypatch.setattr("time.monotonic", fake_monotonic) + monkeypatch.setattr("time.sleep", lambda _: None) + monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid) + monkeypatch.setattr("os.kill", mock_kill) + + gateway._wait_for_gateway_exit(timeout=10.0, force_after=5.0) + assert (42, signal.SIGKILL) in kills + + def test_handles_process_already_gone_on_kill(self, monkeypatch): + """ProcessLookupError during SIGKILL is not fatal.""" + import time as _time + + call_num = 0 + def fake_monotonic(): + nonlocal call_num + call_num += 1 + return call_num * 3.0 # Jump past force_after quickly + + def mock_kill(pid, sig): + raise ProcessLookupError + + monkeypatch.setattr("time.monotonic", fake_monotonic) + monkeypatch.setattr("time.sleep", lambda _: None) + monkeypatch.setattr("gateway.status.get_running_pid", lambda: 99) + monkeypatch.setattr("os.kill", mock_kill) + + # Should not raise — ProcessLookupError means it's already gone. + gateway._wait_for_gateway_exit(timeout=10.0, force_after=2.0)