fix(gateway): PID-based wait with force-kill for gateway restart
Add _wait_for_gateway_exit() that polls get_running_pid() to confirm the old gateway process has actually exited before starting a new one. If the process doesn't exit within 5s, sends SIGKILL to the specific PID. Uses the saved PID from gateway.pid (not launchd labels) so it works correctly with multiple gateway instances under separate HERMES_HOME directories. Applied to both launchd_restart() and the manual restart path (replaces the blind time.sleep(2)). Inspired by PR #1881 by @AzothZephyr (race condition diagnosis). Adds 4 tests.
This commit is contained in:
@@ -849,6 +849,46 @@ def launchd_stop():
|
||||
subprocess.run(["launchctl", "stop", "ai.hermes.gateway"], check=True)
|
||||
print("✓ Service stopped")
|
||||
|
||||
def _wait_for_gateway_exit(timeout: float = 10.0, force_after: float = 5.0):
|
||||
"""Wait for the gateway process (by saved PID) to exit.
|
||||
|
||||
Uses the PID from the gateway.pid file — not launchd labels — so this
|
||||
works correctly when multiple gateway instances run under separate
|
||||
HERMES_HOME directories.
|
||||
|
||||
Args:
|
||||
timeout: Total seconds to wait before giving up.
|
||||
force_after: Seconds of graceful waiting before sending SIGKILL.
|
||||
"""
|
||||
import time
|
||||
from gateway.status import get_running_pid
|
||||
|
||||
deadline = time.monotonic() + timeout
|
||||
force_deadline = time.monotonic() + force_after
|
||||
force_sent = False
|
||||
|
||||
while time.monotonic() < deadline:
|
||||
pid = get_running_pid()
|
||||
if pid is None:
|
||||
return # Process exited cleanly.
|
||||
|
||||
if not force_sent and time.monotonic() >= force_deadline:
|
||||
# Grace period expired — force-kill the specific PID.
|
||||
try:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
print(f"⚠ Gateway PID {pid} did not exit gracefully; sent SIGKILL")
|
||||
except (ProcessLookupError, PermissionError):
|
||||
return # Already gone or we can't touch it.
|
||||
force_sent = True
|
||||
|
||||
time.sleep(0.3)
|
||||
|
||||
# Timed out even after SIGKILL.
|
||||
remaining_pid = get_running_pid()
|
||||
if remaining_pid is not None:
|
||||
print(f"⚠ Gateway PID {remaining_pid} still running after {timeout}s — restart may fail")
|
||||
|
||||
|
||||
def launchd_restart():
|
||||
try:
|
||||
launchd_stop()
|
||||
@@ -856,6 +896,7 @@ def launchd_restart():
|
||||
if e.returncode != 3:
|
||||
raise
|
||||
print("↻ launchd job was unloaded; skipping stop")
|
||||
_wait_for_gateway_exit()
|
||||
launchd_start()
|
||||
|
||||
def launchd_status(deep: bool = False):
|
||||
@@ -1753,10 +1794,9 @@ def gateway_command(args):
|
||||
killed = kill_gateway_processes()
|
||||
if killed:
|
||||
print(f"✓ Stopped {killed} gateway process(es)")
|
||||
|
||||
import time
|
||||
time.sleep(2)
|
||||
|
||||
|
||||
_wait_for_gateway_exit(timeout=10.0, force_after=5.0)
|
||||
|
||||
# Start fresh
|
||||
print("Starting gateway...")
|
||||
run_gateway(verbose=False)
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
"""Tests for hermes_cli.gateway."""
|
||||
|
||||
import signal
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import patch, call
|
||||
|
||||
import hermes_cli.gateway as gateway
|
||||
|
||||
@@ -169,3 +171,84 @@ def test_install_linux_gateway_from_setup_system_choice_as_root_installs(monkeyp
|
||||
|
||||
assert (scope, did_install) == ("system", True)
|
||||
assert calls == [(True, True, "alice")]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _wait_for_gateway_exit
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestWaitForGatewayExit:
|
||||
"""PID-based wait with force-kill on timeout."""
|
||||
|
||||
def test_returns_immediately_when_no_pid(self, monkeypatch):
|
||||
"""If get_running_pid returns None, exit instantly."""
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
|
||||
# Should return without sleeping at all.
|
||||
gateway._wait_for_gateway_exit(timeout=1.0, force_after=0.5)
|
||||
|
||||
def test_returns_when_process_exits_gracefully(self, monkeypatch):
|
||||
"""Process exits after a couple of polls — no SIGKILL needed."""
|
||||
poll_count = 0
|
||||
|
||||
def mock_get_running_pid():
|
||||
nonlocal poll_count
|
||||
poll_count += 1
|
||||
return 12345 if poll_count <= 2 else None
|
||||
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid)
|
||||
monkeypatch.setattr("time.sleep", lambda _: None)
|
||||
|
||||
gateway._wait_for_gateway_exit(timeout=10.0, force_after=999.0)
|
||||
# Should have polled until None was returned.
|
||||
assert poll_count == 3
|
||||
|
||||
def test_force_kills_after_grace_period(self, monkeypatch):
|
||||
"""When the process doesn't exit, SIGKILL the saved PID."""
|
||||
import time as _time
|
||||
|
||||
# Simulate monotonic time advancing past force_after
|
||||
call_num = 0
|
||||
def fake_monotonic():
|
||||
nonlocal call_num
|
||||
call_num += 1
|
||||
# First two calls: initial deadline + force_deadline setup (time 0)
|
||||
# Then each loop iteration advances time
|
||||
return call_num * 2.0 # 2, 4, 6, 8, ...
|
||||
|
||||
kills = []
|
||||
def mock_kill(pid, sig):
|
||||
kills.append((pid, sig))
|
||||
|
||||
# get_running_pid returns the PID until kill is sent, then None
|
||||
def mock_get_running_pid():
|
||||
return None if kills else 42
|
||||
|
||||
monkeypatch.setattr("time.monotonic", fake_monotonic)
|
||||
monkeypatch.setattr("time.sleep", lambda _: None)
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid)
|
||||
monkeypatch.setattr("os.kill", mock_kill)
|
||||
|
||||
gateway._wait_for_gateway_exit(timeout=10.0, force_after=5.0)
|
||||
assert (42, signal.SIGKILL) in kills
|
||||
|
||||
def test_handles_process_already_gone_on_kill(self, monkeypatch):
|
||||
"""ProcessLookupError during SIGKILL is not fatal."""
|
||||
import time as _time
|
||||
|
||||
call_num = 0
|
||||
def fake_monotonic():
|
||||
nonlocal call_num
|
||||
call_num += 1
|
||||
return call_num * 3.0 # Jump past force_after quickly
|
||||
|
||||
def mock_kill(pid, sig):
|
||||
raise ProcessLookupError
|
||||
|
||||
monkeypatch.setattr("time.monotonic", fake_monotonic)
|
||||
monkeypatch.setattr("time.sleep", lambda _: None)
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 99)
|
||||
monkeypatch.setattr("os.kill", mock_kill)
|
||||
|
||||
# Should not raise — ProcessLookupError means it's already gone.
|
||||
gateway._wait_for_gateway_exit(timeout=10.0, force_after=2.0)
|
||||
|
||||
Reference in New Issue
Block a user