diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 1c44e3113..a7876bc40 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -849,6 +849,46 @@ def launchd_stop(): subprocess.run(["launchctl", "stop", "ai.hermes.gateway"], check=True) print("✓ Service stopped") +def _wait_for_gateway_exit(timeout: float = 10.0, force_after: float = 5.0): + """Wait for the gateway process (by saved PID) to exit. + + Uses the PID from the gateway.pid file — not launchd labels — so this + works correctly when multiple gateway instances run under separate + HERMES_HOME directories. + + Args: + timeout: Total seconds to wait before giving up. + force_after: Seconds of graceful waiting before sending SIGKILL. + """ + import time + from gateway.status import get_running_pid + + deadline = time.monotonic() + timeout + force_deadline = time.monotonic() + force_after + force_sent = False + + while time.monotonic() < deadline: + pid = get_running_pid() + if pid is None: + return # Process exited cleanly. + + if not force_sent and time.monotonic() >= force_deadline: + # Grace period expired — force-kill the specific PID. + try: + os.kill(pid, signal.SIGKILL) + print(f"⚠ Gateway PID {pid} did not exit gracefully; sent SIGKILL") + except (ProcessLookupError, PermissionError): + return # Already gone or we can't touch it. + force_sent = True + + time.sleep(0.3) + + # Timed out even after SIGKILL. + remaining_pid = get_running_pid() + if remaining_pid is not None: + print(f"⚠ Gateway PID {remaining_pid} still running after {timeout}s — restart may fail") + + def launchd_restart(): try: launchd_stop() @@ -856,6 +896,7 @@ def launchd_restart(): if e.returncode != 3: raise print("↻ launchd job was unloaded; skipping stop") + _wait_for_gateway_exit() launchd_start() def launchd_status(deep: bool = False): @@ -1753,10 +1794,9 @@ def gateway_command(args): killed = kill_gateway_processes() if killed: print(f"✓ Stopped {killed} gateway process(es)") - - import time - time.sleep(2) - + + _wait_for_gateway_exit(timeout=10.0, force_after=5.0) + # Start fresh print("Starting gateway...") run_gateway(verbose=False) diff --git a/tests/hermes_cli/test_gateway.py b/tests/hermes_cli/test_gateway.py index 52d43fd08..b92f385e2 100644 --- a/tests/hermes_cli/test_gateway.py +++ b/tests/hermes_cli/test_gateway.py @@ -1,6 +1,8 @@ """Tests for hermes_cli.gateway.""" +import signal from types import SimpleNamespace +from unittest.mock import patch, call import hermes_cli.gateway as gateway @@ -169,3 +171,84 @@ def test_install_linux_gateway_from_setup_system_choice_as_root_installs(monkeyp assert (scope, did_install) == ("system", True) assert calls == [(True, True, "alice")] + + +# --------------------------------------------------------------------------- +# _wait_for_gateway_exit +# --------------------------------------------------------------------------- + + +class TestWaitForGatewayExit: + """PID-based wait with force-kill on timeout.""" + + def test_returns_immediately_when_no_pid(self, monkeypatch): + """If get_running_pid returns None, exit instantly.""" + monkeypatch.setattr("gateway.status.get_running_pid", lambda: None) + # Should return without sleeping at all. + gateway._wait_for_gateway_exit(timeout=1.0, force_after=0.5) + + def test_returns_when_process_exits_gracefully(self, monkeypatch): + """Process exits after a couple of polls — no SIGKILL needed.""" + poll_count = 0 + + def mock_get_running_pid(): + nonlocal poll_count + poll_count += 1 + return 12345 if poll_count <= 2 else None + + monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid) + monkeypatch.setattr("time.sleep", lambda _: None) + + gateway._wait_for_gateway_exit(timeout=10.0, force_after=999.0) + # Should have polled until None was returned. + assert poll_count == 3 + + def test_force_kills_after_grace_period(self, monkeypatch): + """When the process doesn't exit, SIGKILL the saved PID.""" + import time as _time + + # Simulate monotonic time advancing past force_after + call_num = 0 + def fake_monotonic(): + nonlocal call_num + call_num += 1 + # First two calls: initial deadline + force_deadline setup (time 0) + # Then each loop iteration advances time + return call_num * 2.0 # 2, 4, 6, 8, ... + + kills = [] + def mock_kill(pid, sig): + kills.append((pid, sig)) + + # get_running_pid returns the PID until kill is sent, then None + def mock_get_running_pid(): + return None if kills else 42 + + monkeypatch.setattr("time.monotonic", fake_monotonic) + monkeypatch.setattr("time.sleep", lambda _: None) + monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid) + monkeypatch.setattr("os.kill", mock_kill) + + gateway._wait_for_gateway_exit(timeout=10.0, force_after=5.0) + assert (42, signal.SIGKILL) in kills + + def test_handles_process_already_gone_on_kill(self, monkeypatch): + """ProcessLookupError during SIGKILL is not fatal.""" + import time as _time + + call_num = 0 + def fake_monotonic(): + nonlocal call_num + call_num += 1 + return call_num * 3.0 # Jump past force_after quickly + + def mock_kill(pid, sig): + raise ProcessLookupError + + monkeypatch.setattr("time.monotonic", fake_monotonic) + monkeypatch.setattr("time.sleep", lambda _: None) + monkeypatch.setattr("gateway.status.get_running_pid", lambda: 99) + monkeypatch.setattr("os.kill", mock_kill) + + # Should not raise — ProcessLookupError means it's already gone. + gateway._wait_for_gateway_exit(timeout=10.0, force_after=2.0)