Merge pull request #1902 from NousResearch/hermes/hermes-b29f73b2
fix(gateway): PID-based wait with force-kill for gateway restart
This commit is contained in:
@@ -849,6 +849,46 @@ def launchd_stop():
|
||||
subprocess.run(["launchctl", "stop", "ai.hermes.gateway"], check=True)
|
||||
print("✓ Service stopped")
|
||||
|
||||
def _wait_for_gateway_exit(timeout: float = 10.0, force_after: float = 5.0):
|
||||
"""Wait for the gateway process (by saved PID) to exit.
|
||||
|
||||
Uses the PID from the gateway.pid file — not launchd labels — so this
|
||||
works correctly when multiple gateway instances run under separate
|
||||
HERMES_HOME directories.
|
||||
|
||||
Args:
|
||||
timeout: Total seconds to wait before giving up.
|
||||
force_after: Seconds of graceful waiting before sending SIGKILL.
|
||||
"""
|
||||
import time
|
||||
from gateway.status import get_running_pid
|
||||
|
||||
deadline = time.monotonic() + timeout
|
||||
force_deadline = time.monotonic() + force_after
|
||||
force_sent = False
|
||||
|
||||
while time.monotonic() < deadline:
|
||||
pid = get_running_pid()
|
||||
if pid is None:
|
||||
return # Process exited cleanly.
|
||||
|
||||
if not force_sent and time.monotonic() >= force_deadline:
|
||||
# Grace period expired — force-kill the specific PID.
|
||||
try:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
print(f"⚠ Gateway PID {pid} did not exit gracefully; sent SIGKILL")
|
||||
except (ProcessLookupError, PermissionError):
|
||||
return # Already gone or we can't touch it.
|
||||
force_sent = True
|
||||
|
||||
time.sleep(0.3)
|
||||
|
||||
# Timed out even after SIGKILL.
|
||||
remaining_pid = get_running_pid()
|
||||
if remaining_pid is not None:
|
||||
print(f"⚠ Gateway PID {remaining_pid} still running after {timeout}s — restart may fail")
|
||||
|
||||
|
||||
def launchd_restart():
|
||||
try:
|
||||
launchd_stop()
|
||||
@@ -856,6 +896,7 @@ def launchd_restart():
|
||||
if e.returncode != 3:
|
||||
raise
|
||||
print("↻ launchd job was unloaded; skipping stop")
|
||||
_wait_for_gateway_exit()
|
||||
launchd_start()
|
||||
|
||||
def launchd_status(deep: bool = False):
|
||||
@@ -1753,10 +1794,9 @@ def gateway_command(args):
|
||||
killed = kill_gateway_processes()
|
||||
if killed:
|
||||
print(f"✓ Stopped {killed} gateway process(es)")
|
||||
|
||||
import time
|
||||
time.sleep(2)
|
||||
|
||||
|
||||
_wait_for_gateway_exit(timeout=10.0, force_after=5.0)
|
||||
|
||||
# Start fresh
|
||||
print("Starting gateway...")
|
||||
run_gateway(verbose=False)
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
"""Tests for hermes_cli.gateway."""
|
||||
|
||||
import signal
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import patch, call
|
||||
|
||||
import hermes_cli.gateway as gateway
|
||||
|
||||
@@ -169,3 +171,84 @@ def test_install_linux_gateway_from_setup_system_choice_as_root_installs(monkeyp
|
||||
|
||||
assert (scope, did_install) == ("system", True)
|
||||
assert calls == [(True, True, "alice")]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _wait_for_gateway_exit
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestWaitForGatewayExit:
|
||||
"""PID-based wait with force-kill on timeout."""
|
||||
|
||||
def test_returns_immediately_when_no_pid(self, monkeypatch):
|
||||
"""If get_running_pid returns None, exit instantly."""
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
|
||||
# Should return without sleeping at all.
|
||||
gateway._wait_for_gateway_exit(timeout=1.0, force_after=0.5)
|
||||
|
||||
def test_returns_when_process_exits_gracefully(self, monkeypatch):
|
||||
"""Process exits after a couple of polls — no SIGKILL needed."""
|
||||
poll_count = 0
|
||||
|
||||
def mock_get_running_pid():
|
||||
nonlocal poll_count
|
||||
poll_count += 1
|
||||
return 12345 if poll_count <= 2 else None
|
||||
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid)
|
||||
monkeypatch.setattr("time.sleep", lambda _: None)
|
||||
|
||||
gateway._wait_for_gateway_exit(timeout=10.0, force_after=999.0)
|
||||
# Should have polled until None was returned.
|
||||
assert poll_count == 3
|
||||
|
||||
def test_force_kills_after_grace_period(self, monkeypatch):
|
||||
"""When the process doesn't exit, SIGKILL the saved PID."""
|
||||
import time as _time
|
||||
|
||||
# Simulate monotonic time advancing past force_after
|
||||
call_num = 0
|
||||
def fake_monotonic():
|
||||
nonlocal call_num
|
||||
call_num += 1
|
||||
# First two calls: initial deadline + force_deadline setup (time 0)
|
||||
# Then each loop iteration advances time
|
||||
return call_num * 2.0 # 2, 4, 6, 8, ...
|
||||
|
||||
kills = []
|
||||
def mock_kill(pid, sig):
|
||||
kills.append((pid, sig))
|
||||
|
||||
# get_running_pid returns the PID until kill is sent, then None
|
||||
def mock_get_running_pid():
|
||||
return None if kills else 42
|
||||
|
||||
monkeypatch.setattr("time.monotonic", fake_monotonic)
|
||||
monkeypatch.setattr("time.sleep", lambda _: None)
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid)
|
||||
monkeypatch.setattr("os.kill", mock_kill)
|
||||
|
||||
gateway._wait_for_gateway_exit(timeout=10.0, force_after=5.0)
|
||||
assert (42, signal.SIGKILL) in kills
|
||||
|
||||
def test_handles_process_already_gone_on_kill(self, monkeypatch):
|
||||
"""ProcessLookupError during SIGKILL is not fatal."""
|
||||
import time as _time
|
||||
|
||||
call_num = 0
|
||||
def fake_monotonic():
|
||||
nonlocal call_num
|
||||
call_num += 1
|
||||
return call_num * 3.0 # Jump past force_after quickly
|
||||
|
||||
def mock_kill(pid, sig):
|
||||
raise ProcessLookupError
|
||||
|
||||
monkeypatch.setattr("time.monotonic", fake_monotonic)
|
||||
monkeypatch.setattr("time.sleep", lambda _: None)
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 99)
|
||||
monkeypatch.setattr("os.kill", mock_kill)
|
||||
|
||||
# Should not raise — ProcessLookupError means it's already gone.
|
||||
gateway._wait_for_gateway_exit(timeout=10.0, force_after=2.0)
|
||||
|
||||
Reference in New Issue
Block a user