Files
hermes-agent/tests/hermes_cli/test_gateway.py
Test ace2cc6257 fix(gateway): PID-based wait with force-kill for gateway restart
Add _wait_for_gateway_exit() that polls get_running_pid() to confirm
the old gateway process has actually exited before starting a new one.
If the process doesn't exit within 5s, sends SIGKILL to the specific
PID. Uses the saved PID from gateway.pid (not launchd labels) so it
works correctly with multiple gateway instances under separate
HERMES_HOME directories.

Applied to both launchd_restart() and the manual restart path (replaces
the blind time.sleep(2)).

Inspired by PR #1881 by @AzothZephyr (race condition diagnosis).
Adds 4 tests.
2026-03-18 02:54:18 -07:00

255 lines
9.7 KiB
Python

"""Tests for hermes_cli.gateway."""
import signal
from types import SimpleNamespace
from unittest.mock import patch, call
import hermes_cli.gateway as gateway
class TestSystemdLingerStatus:
def test_reports_enabled(self, monkeypatch):
monkeypatch.setattr(gateway, "is_linux", lambda: True)
monkeypatch.setenv("USER", "alice")
monkeypatch.setattr(
gateway.subprocess,
"run",
lambda *args, **kwargs: SimpleNamespace(returncode=0, stdout="yes\n", stderr=""),
)
monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/loginctl")
assert gateway.get_systemd_linger_status() == (True, "")
def test_reports_disabled(self, monkeypatch):
monkeypatch.setattr(gateway, "is_linux", lambda: True)
monkeypatch.setenv("USER", "alice")
monkeypatch.setattr(
gateway.subprocess,
"run",
lambda *args, **kwargs: SimpleNamespace(returncode=0, stdout="no\n", stderr=""),
)
monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/loginctl")
assert gateway.get_systemd_linger_status() == (False, "")
def test_systemd_status_warns_when_linger_disabled(monkeypatch, tmp_path, capsys):
unit_path = tmp_path / "hermes-gateway.service"
unit_path.write_text("[Unit]\n")
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
monkeypatch.setattr(gateway, "get_systemd_linger_status", lambda: (False, ""))
def fake_run(cmd, capture_output=False, text=False, check=False):
if cmd[:4] == ["systemctl", "--user", "status", gateway.get_service_name()]:
return SimpleNamespace(returncode=0, stdout="", stderr="")
if cmd[:3] == ["systemctl", "--user", "is-active"]:
return SimpleNamespace(returncode=0, stdout="active\n", stderr="")
raise AssertionError(f"Unexpected command: {cmd}")
monkeypatch.setattr(gateway.subprocess, "run", fake_run)
gateway.systemd_status(deep=False)
out = capsys.readouterr().out
assert "gateway service is running" in out
assert "Systemd linger is disabled" in out
assert "loginctl enable-linger" in out
def test_systemd_install_checks_linger_status(monkeypatch, tmp_path, capsys):
unit_path = tmp_path / "systemd" / "user" / "hermes-gateway.service"
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
calls = []
helper_calls = []
def fake_run(cmd, check=False, **kwargs):
calls.append((cmd, check))
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway.subprocess, "run", fake_run)
monkeypatch.setattr(gateway, "_ensure_linger_enabled", lambda: helper_calls.append(True))
gateway.systemd_install(force=False)
out = capsys.readouterr().out
assert unit_path.exists()
assert [cmd for cmd, _ in calls] == [
["systemctl", "--user", "daemon-reload"],
["systemctl", "--user", "enable", gateway.get_service_name()],
]
assert helper_calls == [True]
assert "User service installed and enabled" in out
def test_systemd_install_system_scope_skips_linger_and_uses_systemctl(monkeypatch, tmp_path, capsys):
unit_path = tmp_path / "etc" / "systemd" / "system" / "hermes-gateway.service"
monkeypatch.setattr(gateway, "get_systemd_unit_path", lambda system=False: unit_path)
monkeypatch.setattr(
gateway,
"generate_systemd_unit",
lambda system=False, run_as_user=None: f"scope={system} user={run_as_user}\n",
)
monkeypatch.setattr(gateway, "_require_root_for_system_service", lambda action: None)
calls = []
helper_calls = []
def fake_run(cmd, check=False, **kwargs):
calls.append((cmd, check))
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway.subprocess, "run", fake_run)
monkeypatch.setattr(gateway, "_ensure_linger_enabled", lambda: helper_calls.append(True))
gateway.systemd_install(force=False, system=True, run_as_user="alice")
out = capsys.readouterr().out
assert unit_path.exists()
assert unit_path.read_text(encoding="utf-8") == "scope=True user=alice\n"
assert [cmd for cmd, _ in calls] == [
["systemctl", "daemon-reload"],
["systemctl", "enable", gateway.get_service_name()],
]
assert helper_calls == []
assert "Configured to run as: alice" not in out # generated test unit has no User= line
assert "System service installed and enabled" in out
def test_conflicting_systemd_units_warning(monkeypatch, tmp_path, capsys):
user_unit = tmp_path / "user" / "hermes-gateway.service"
system_unit = tmp_path / "system" / "hermes-gateway.service"
user_unit.parent.mkdir(parents=True)
system_unit.parent.mkdir(parents=True)
user_unit.write_text("[Unit]\n", encoding="utf-8")
system_unit.write_text("[Unit]\n", encoding="utf-8")
monkeypatch.setattr(
gateway,
"get_systemd_unit_path",
lambda system=False: system_unit if system else user_unit,
)
gateway.print_systemd_scope_conflict_warning()
out = capsys.readouterr().out
assert "Both user and system gateway services are installed" in out
assert "hermes gateway uninstall" in out
assert "--system" in out
def test_install_linux_gateway_from_setup_system_choice_without_root_prints_followup(monkeypatch, capsys):
monkeypatch.setattr(gateway, "prompt_linux_gateway_install_scope", lambda: "system")
monkeypatch.setattr(gateway.os, "geteuid", lambda: 1000)
monkeypatch.setattr(gateway, "_default_system_service_user", lambda: "alice")
monkeypatch.setattr(gateway, "systemd_install", lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("should not install")))
scope, did_install = gateway.install_linux_gateway_from_setup(force=False)
out = capsys.readouterr().out
assert (scope, did_install) == ("system", False)
assert "sudo hermes gateway install --system --run-as-user alice" in out
assert "sudo hermes gateway start --system" in out
def test_install_linux_gateway_from_setup_system_choice_as_root_installs(monkeypatch):
monkeypatch.setattr(gateway, "prompt_linux_gateway_install_scope", lambda: "system")
monkeypatch.setattr(gateway.os, "geteuid", lambda: 0)
monkeypatch.setattr(gateway, "_default_system_service_user", lambda: "alice")
calls = []
monkeypatch.setattr(
gateway,
"systemd_install",
lambda force=False, system=False, run_as_user=None: calls.append((force, system, run_as_user)),
)
scope, did_install = gateway.install_linux_gateway_from_setup(force=True)
assert (scope, did_install) == ("system", True)
assert calls == [(True, True, "alice")]
# ---------------------------------------------------------------------------
# _wait_for_gateway_exit
# ---------------------------------------------------------------------------
class TestWaitForGatewayExit:
"""PID-based wait with force-kill on timeout."""
def test_returns_immediately_when_no_pid(self, monkeypatch):
"""If get_running_pid returns None, exit instantly."""
monkeypatch.setattr("gateway.status.get_running_pid", lambda: None)
# Should return without sleeping at all.
gateway._wait_for_gateway_exit(timeout=1.0, force_after=0.5)
def test_returns_when_process_exits_gracefully(self, monkeypatch):
"""Process exits after a couple of polls — no SIGKILL needed."""
poll_count = 0
def mock_get_running_pid():
nonlocal poll_count
poll_count += 1
return 12345 if poll_count <= 2 else None
monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid)
monkeypatch.setattr("time.sleep", lambda _: None)
gateway._wait_for_gateway_exit(timeout=10.0, force_after=999.0)
# Should have polled until None was returned.
assert poll_count == 3
def test_force_kills_after_grace_period(self, monkeypatch):
"""When the process doesn't exit, SIGKILL the saved PID."""
import time as _time
# Simulate monotonic time advancing past force_after
call_num = 0
def fake_monotonic():
nonlocal call_num
call_num += 1
# First two calls: initial deadline + force_deadline setup (time 0)
# Then each loop iteration advances time
return call_num * 2.0 # 2, 4, 6, 8, ...
kills = []
def mock_kill(pid, sig):
kills.append((pid, sig))
# get_running_pid returns the PID until kill is sent, then None
def mock_get_running_pid():
return None if kills else 42
monkeypatch.setattr("time.monotonic", fake_monotonic)
monkeypatch.setattr("time.sleep", lambda _: None)
monkeypatch.setattr("gateway.status.get_running_pid", mock_get_running_pid)
monkeypatch.setattr("os.kill", mock_kill)
gateway._wait_for_gateway_exit(timeout=10.0, force_after=5.0)
assert (42, signal.SIGKILL) in kills
def test_handles_process_already_gone_on_kill(self, monkeypatch):
"""ProcessLookupError during SIGKILL is not fatal."""
import time as _time
call_num = 0
def fake_monotonic():
nonlocal call_num
call_num += 1
return call_num * 3.0 # Jump past force_after quickly
def mock_kill(pid, sig):
raise ProcessLookupError
monkeypatch.setattr("time.monotonic", fake_monotonic)
monkeypatch.setattr("time.sleep", lambda _: None)
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 99)
monkeypatch.setattr("os.kill", mock_kill)
# Should not raise — ProcessLookupError means it's already gone.
gateway._wait_for_gateway_exit(timeout=10.0, force_after=2.0)