Merge pull request #1902 from NousResearch/hermes/hermes-b29f73b2

fix(gateway): PID-based wait with force-kill for gateway restart
This commit is contained in:
Teknium
2026-03-18 02:54:38 -07:00
committed by GitHub
2 changed files with 127 additions and 4 deletions

View File

@@ -849,6 +849,46 @@ def launchd_stop():
subprocess.run(["launchctl", "stop", "ai.hermes.gateway"], check=True)
print("✓ Service stopped")
def _wait_for_gateway_exit(timeout: float = 10.0, force_after: float = 5.0):
"""Wait for the gateway process (by saved PID) to exit.
Uses the PID from the gateway.pid file — not launchd labels — so this
works correctly when multiple gateway instances run under separate
HERMES_HOME directories.
Args:
timeout: Total seconds to wait before giving up.
force_after: Seconds of graceful waiting before sending SIGKILL.
"""
import time
from gateway.status import get_running_pid
deadline = time.monotonic() + timeout
force_deadline = time.monotonic() + force_after
force_sent = False
while time.monotonic() < deadline:
pid = get_running_pid()
if pid is None:
return # Process exited cleanly.
if not force_sent and time.monotonic() >= force_deadline:
# Grace period expired — force-kill the specific PID.
try:
os.kill(pid, signal.SIGKILL)
print(f"⚠ Gateway PID {pid} did not exit gracefully; sent SIGKILL")
except (ProcessLookupError, PermissionError):
return # Already gone or we can't touch it.
force_sent = True
time.sleep(0.3)
# Timed out even after SIGKILL.
remaining_pid = get_running_pid()
if remaining_pid is not None:
print(f"⚠ Gateway PID {remaining_pid} still running after {timeout}s — restart may fail")
def launchd_restart():
try:
launchd_stop()
@@ -856,6 +896,7 @@ def launchd_restart():
if e.returncode != 3:
raise
print("↻ launchd job was unloaded; skipping stop")
_wait_for_gateway_exit()
launchd_start()
def launchd_status(deep: bool = False):
@@ -1753,10 +1794,9 @@ def gateway_command(args):
killed = kill_gateway_processes()
if killed:
print(f"✓ Stopped {killed} gateway process(es)")
import time
time.sleep(2)
_wait_for_gateway_exit(timeout=10.0, force_after=5.0)
# Start fresh
print("Starting gateway...")
run_gateway(verbose=False)

View File

@@ -1,6 +1,8 @@
"""Tests for hermes_cli.gateway."""
import signal
from types import SimpleNamespace
from unittest.mock import patch, call
import hermes_cli.gateway as gateway
@@ -169,3 +171,84 @@ def test_install_linux_gateway_from_setup_system_choice_as_root_installs(monkeyp
assert (scope, did_install) == ("system", True)
assert calls == [(True, True, "alice")]
# ---------------------------------------------------------------------------
# _wait_for_gateway_exit
# ---------------------------------------------------------------------------
class TestWaitForGatewayExit:
"""PID-based wait with force-kill on timeout."""
def test_returns_immediately_when_no_pid(self, monkeypatch):
"""If get_running_pid returns None, exit instantly."""
monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
# Should return without sleeping at all.
gateway._wait_for_gateway_exit(timeout=1.0, force_after=0.5)
def test_returns_when_process_exits_gracefully(self, monkeypatch):
"""Process exits after a couple of polls — no SIGKILL needed."""
poll_count = 0
def mock_get_running_pid():
nonlocal poll_count
poll_count += 1
return 12345 if poll_count <= 2 else None
monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid)
monkeypatch.setattr("time.sleep", lambda _: None)
gateway._wait_for_gateway_exit(timeout=10.0, force_after=999.0)
# Should have polled until None was returned.
assert poll_count == 3
def test_force_kills_after_grace_period(self, monkeypatch):
"""When the process doesn't exit, SIGKILL the saved PID."""
import time as _time
# Simulate monotonic time advancing past force_after
call_num = 0
def fake_monotonic():
nonlocal call_num
call_num += 1
# First two calls: initial deadline + force_deadline setup (time 0)
# Then each loop iteration advances time
return call_num * 2.0 # 2, 4, 6, 8, ...
kills = []
def mock_kill(pid, sig):
kills.append((pid, sig))
# get_running_pid returns the PID until kill is sent, then None
def mock_get_running_pid():
return None if kills else 42
monkeypatch.setattr("time.monotonic", fake_monotonic)
monkeypatch.setattr("time.sleep", lambda _: None)
monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid)
monkeypatch.setattr("os.kill", mock_kill)
gateway._wait_for_gateway_exit(timeout=10.0, force_after=5.0)
assert (42, signal.SIGKILL) in kills
def test_handles_process_already_gone_on_kill(self, monkeypatch):
"""ProcessLookupError during SIGKILL is not fatal."""
import time as _time
call_num = 0
def fake_monotonic():
nonlocal call_num
call_num += 1
return call_num * 3.0 # Jump past force_after quickly
def mock_kill(pid, sig):
raise ProcessLookupError
monkeypatch.setattr("time.monotonic", fake_monotonic)
monkeypatch.setattr("time.sleep", lambda _: None)
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 99)
monkeypatch.setattr("os.kill", mock_kill)
# Should not raise — ProcessLookupError means it's already gone.
gateway._wait_for_gateway_exit(timeout=10.0, force_after=2.0)