fix: hermes update kills freshly-restarted gateway service
After restarting a service-managed gateway (systemd/launchd), the stale-process sweep calls find_gateway_pids() which returns ALL gateway PIDs via ps aux — including the one just spawned by the service manager. The sweep kills it, leaving the user with a stopped gateway and a confusing 'Restart manually' message. Fix: add _get_service_pids() to query systemd MainPID and launchd PID for active gateway services, then exclude those PIDs from the sweep. Also add exclude_pids parameter to find_gateway_pids() and kill_gateway_processes() so callers can skip known service-managed PIDs. Adds 9 targeted tests covering: - _get_service_pids() for systemd, launchd, empty, and zero-PID cases - find_gateway_pids() exclude_pids filtering - cmd_update integration: service PID not killed after restart - cmd_update integration: manual PID killed while service PID preserved
This commit is contained in:
@@ -28,9 +28,101 @@ from hermes_cli.colors import Colors, color
|
||||
# Process Management (for manual gateway runs)
|
||||
# =============================================================================
|
||||
|
||||
def find_gateway_pids() -> list:
|
||||
"""Find PIDs of running gateway processes."""
|
||||
def _get_service_pids() -> set:
|
||||
"""Return PIDs currently managed by systemd or launchd gateway services.
|
||||
|
||||
Used to avoid killing freshly-restarted service processes when sweeping
|
||||
for stale manual gateway processes after a service restart.
|
||||
"""
|
||||
pids: set = set()
|
||||
|
||||
# --- systemd (Linux) ---
|
||||
if is_linux():
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["systemctl", "--user", "list-units", "hermes-gateway*",
|
||||
"--plain", "--no-legend", "--no-pager"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
for line in result.stdout.strip().splitlines():
|
||||
parts = line.split()
|
||||
if not parts or not parts[0].endswith(".service"):
|
||||
continue
|
||||
svc = parts[0]
|
||||
try:
|
||||
show = subprocess.run(
|
||||
["systemctl", "--user", "show", svc,
|
||||
"--property=MainPID", "--value"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
pid = int(show.stdout.strip())
|
||||
if pid > 0:
|
||||
pids.add(pid)
|
||||
except (ValueError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
|
||||
# Also check system scope
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["systemctl", "list-units", "hermes-gateway*",
|
||||
"--plain", "--no-legend", "--no-pager"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
for line in result.stdout.strip().splitlines():
|
||||
parts = line.split()
|
||||
if not parts or not parts[0].endswith(".service"):
|
||||
continue
|
||||
svc = parts[0]
|
||||
try:
|
||||
show = subprocess.run(
|
||||
["systemctl", "show", svc,
|
||||
"--property=MainPID", "--value"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
pid = int(show.stdout.strip())
|
||||
if pid > 0:
|
||||
pids.add(pid)
|
||||
except (ValueError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
|
||||
# --- launchd (macOS) ---
|
||||
if is_macos():
|
||||
try:
|
||||
from hermes_cli.gateway import get_launchd_label
|
||||
result = subprocess.run(
|
||||
["launchctl", "list", get_launchd_label()],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
if result.returncode == 0:
|
||||
# Output format: "PID\tStatus\tLabel" header then data line
|
||||
for line in result.stdout.strip().splitlines():
|
||||
parts = line.split()
|
||||
if parts:
|
||||
try:
|
||||
pid = int(parts[0])
|
||||
if pid > 0:
|
||||
pids.add(pid)
|
||||
except ValueError:
|
||||
pass
|
||||
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
|
||||
return pids
|
||||
|
||||
|
||||
def find_gateway_pids(exclude_pids: set | None = None) -> list:
|
||||
"""Find PIDs of running gateway processes.
|
||||
|
||||
Args:
|
||||
exclude_pids: PIDs to exclude from the result (e.g. service-managed
|
||||
PIDs that should not be killed during a stale-process sweep).
|
||||
"""
|
||||
pids = []
|
||||
_exclude = exclude_pids or set()
|
||||
patterns = [
|
||||
"hermes_cli.main gateway",
|
||||
"hermes_cli/main.py gateway",
|
||||
@@ -56,7 +148,7 @@ def find_gateway_pids() -> list:
|
||||
if any(p in current_cmd for p in patterns):
|
||||
try:
|
||||
pid = int(pid_str)
|
||||
if pid != os.getpid() and pid not in pids:
|
||||
if pid != os.getpid() and pid not in pids and pid not in _exclude:
|
||||
pids.append(pid)
|
||||
except ValueError:
|
||||
pass
|
||||
@@ -78,7 +170,7 @@ def find_gateway_pids() -> list:
|
||||
if len(parts) > 1:
|
||||
try:
|
||||
pid = int(parts[1])
|
||||
if pid not in pids:
|
||||
if pid not in pids and pid not in _exclude:
|
||||
pids.append(pid)
|
||||
except ValueError:
|
||||
continue
|
||||
@@ -89,9 +181,15 @@ def find_gateway_pids() -> list:
|
||||
return pids
|
||||
|
||||
|
||||
def kill_gateway_processes(force: bool = False) -> int:
|
||||
"""Kill ALL running gateway processes (across all profiles). Returns count killed."""
|
||||
pids = find_gateway_pids()
|
||||
def kill_gateway_processes(force: bool = False, exclude_pids: set | None = None) -> int:
|
||||
"""Kill any running gateway processes. Returns count killed.
|
||||
|
||||
Args:
|
||||
force: Use SIGKILL instead of SIGTERM.
|
||||
exclude_pids: PIDs to skip (e.g. service-managed PIDs that were just
|
||||
restarted and should not be killed).
|
||||
"""
|
||||
pids = find_gateway_pids(exclude_pids=exclude_pids)
|
||||
killed = 0
|
||||
|
||||
for pid in pids:
|
||||
|
||||
@@ -3607,6 +3607,7 @@ def cmd_update(args):
|
||||
from hermes_cli.gateway import (
|
||||
is_macos, is_linux, _ensure_user_systemd_env,
|
||||
get_systemd_linger_status, find_gateway_pids,
|
||||
_get_service_pids,
|
||||
)
|
||||
import signal as _signal
|
||||
|
||||
@@ -3673,8 +3674,11 @@ def cmd_update(args):
|
||||
pass
|
||||
|
||||
# --- Manual (non-service) gateways ---
|
||||
# Kill any remaining gateway processes not managed by a service
|
||||
manual_pids = find_gateway_pids()
|
||||
# Kill any remaining gateway processes not managed by a service.
|
||||
# Exclude PIDs that belong to just-restarted services so we don't
|
||||
# immediately kill the process that systemd/launchd just spawned.
|
||||
service_pids = _get_service_pids()
|
||||
manual_pids = find_gateway_pids(exclude_pids=service_pids)
|
||||
for pid in manual_pids:
|
||||
try:
|
||||
os.kill(pid, _signal.SIGTERM)
|
||||
|
||||
@@ -491,3 +491,263 @@ class TestCmdUpdateSystemService:
|
||||
captured = capsys.readouterr().out
|
||||
# Both scopes are discovered and restarted
|
||||
assert "Restarted hermes-gateway" in captured
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Service PID exclusion — the core bug fix
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestServicePidExclusion:
|
||||
"""After restarting a service, the stale-process sweep must NOT kill
|
||||
the freshly-spawned service PID. This was the root cause of the bug
|
||||
where ``hermes update`` would restart the gateway and immediately kill it.
|
||||
"""
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_update_launchd_does_not_kill_service_pid(
|
||||
self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path,
|
||||
):
|
||||
"""After launchd restart, the sweep must exclude the service PID."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text("<plist/>")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
|
||||
# The service PID that launchd manages after restart
|
||||
SERVICE_PID = 42000
|
||||
|
||||
mock_run.side_effect = _make_run_side_effect(
|
||||
commit_count="3",
|
||||
launchctl_loaded=True,
|
||||
)
|
||||
|
||||
# Simulate find_gateway_pids returning the service PID (the bug scenario)
|
||||
# and _get_service_pids returning the same PID to exclude it
|
||||
with patch.object(
|
||||
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
|
||||
), patch.object(
|
||||
gateway_cli, "find_gateway_pids",
|
||||
side_effect=lambda exclude_pids=None: (
|
||||
[SERVICE_PID] if not exclude_pids else
|
||||
[p for p in [SERVICE_PID] if p not in exclude_pids]
|
||||
),
|
||||
), patch("os.kill") as mock_kill:
|
||||
cmd_update(mock_args)
|
||||
|
||||
captured = capsys.readouterr().out
|
||||
# Service was restarted
|
||||
assert "Restarted" in captured
|
||||
# The service PID should NOT have been killed by the manual sweep
|
||||
kill_calls = [
|
||||
c for c in mock_kill.call_args_list
|
||||
if c.args[0] == SERVICE_PID
|
||||
]
|
||||
assert len(kill_calls) == 0, (
|
||||
f"Service PID {SERVICE_PID} was killed by the manual sweep — "
|
||||
f"this is the bug where update restarts then immediately kills the gateway"
|
||||
)
|
||||
# Should NOT show manual restart message
|
||||
assert "Restart manually" not in captured
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_update_systemd_does_not_kill_service_pid(
|
||||
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
|
||||
):
|
||||
"""After systemd restart, the sweep must exclude the service PID."""
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
|
||||
|
||||
SERVICE_PID = 55000
|
||||
|
||||
mock_run.side_effect = _make_run_side_effect(
|
||||
commit_count="3",
|
||||
systemd_active=True,
|
||||
)
|
||||
|
||||
with patch.object(
|
||||
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
|
||||
), patch.object(
|
||||
gateway_cli, "find_gateway_pids",
|
||||
side_effect=lambda exclude_pids=None: (
|
||||
[SERVICE_PID] if not exclude_pids else
|
||||
[p for p in [SERVICE_PID] if p not in exclude_pids]
|
||||
),
|
||||
), patch("os.kill") as mock_kill:
|
||||
cmd_update(mock_args)
|
||||
|
||||
captured = capsys.readouterr().out
|
||||
assert "Restarted hermes-gateway" in captured
|
||||
# Service PID must not be killed
|
||||
kill_calls = [
|
||||
c for c in mock_kill.call_args_list
|
||||
if c.args[0] == SERVICE_PID
|
||||
]
|
||||
assert len(kill_calls) == 0
|
||||
assert "Restart manually" not in captured
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_update_kills_manual_pid_but_not_service_pid(
|
||||
self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path,
|
||||
):
|
||||
"""When both a service PID and a manual PID exist, only the manual one
|
||||
is killed."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text("<plist/>")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
|
||||
SERVICE_PID = 42000
|
||||
MANUAL_PID = 42999
|
||||
|
||||
mock_run.side_effect = _make_run_side_effect(
|
||||
commit_count="3",
|
||||
launchctl_loaded=True,
|
||||
)
|
||||
|
||||
def fake_find(exclude_pids=None):
|
||||
_exclude = exclude_pids or set()
|
||||
return [p for p in [SERVICE_PID, MANUAL_PID] if p not in _exclude]
|
||||
|
||||
with patch.object(
|
||||
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
|
||||
), patch.object(
|
||||
gateway_cli, "find_gateway_pids", side_effect=fake_find,
|
||||
), patch("os.kill") as mock_kill:
|
||||
cmd_update(mock_args)
|
||||
|
||||
captured = capsys.readouterr().out
|
||||
assert "Restarted" in captured
|
||||
# Manual PID should be killed
|
||||
manual_kills = [c for c in mock_kill.call_args_list if c.args[0] == MANUAL_PID]
|
||||
assert len(manual_kills) == 1
|
||||
# Service PID should NOT be killed
|
||||
service_kills = [c for c in mock_kill.call_args_list if c.args[0] == SERVICE_PID]
|
||||
assert len(service_kills) == 0
|
||||
# Should show manual stop message since manual PID was killed
|
||||
assert "Stopped 1 manual gateway" in captured
|
||||
|
||||
|
||||
class TestGetServicePids:
|
||||
"""Unit tests for _get_service_pids()."""
|
||||
|
||||
def test_returns_systemd_main_pid(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "list-units" in joined:
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="hermes-gateway.service loaded active running Hermes Gateway\n",
|
||||
stderr="",
|
||||
)
|
||||
if "show" in joined and "MainPID" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="12345\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
pids = gateway_cli._get_service_pids()
|
||||
assert 12345 in pids
|
||||
|
||||
def test_returns_launchd_pid(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "launchctl" in joined and "list" in joined:
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="PID\tStatus\tLabel\n67890\t0\tai.hermes.gateway\n",
|
||||
stderr="",
|
||||
)
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
pids = gateway_cli._get_service_pids()
|
||||
assert 67890 in pids
|
||||
|
||||
def test_returns_empty_when_no_services(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
|
||||
pids = gateway_cli._get_service_pids()
|
||||
assert pids == set()
|
||||
|
||||
def test_excludes_zero_pid(self, monkeypatch):
|
||||
"""systemd returns MainPID=0 for stopped services; skip those."""
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "list-units" in joined:
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="hermes-gateway.service loaded inactive dead Hermes Gateway\n",
|
||||
stderr="",
|
||||
)
|
||||
if "show" in joined and "MainPID" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="0\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
pids = gateway_cli._get_service_pids()
|
||||
assert 0 not in pids
|
||||
assert pids == set()
|
||||
|
||||
|
||||
class TestFindGatewayPidsExclude:
|
||||
"""find_gateway_pids respects exclude_pids."""
|
||||
|
||||
def test_excludes_specified_pids(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_windows", lambda: False)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout=(
|
||||
"user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
|
||||
"user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
|
||||
),
|
||||
stderr="",
|
||||
)
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
monkeypatch.setattr("os.getpid", lambda: 999)
|
||||
|
||||
pids = gateway_cli.find_gateway_pids(exclude_pids={100})
|
||||
assert 100 not in pids
|
||||
assert 200 in pids
|
||||
|
||||
def test_no_exclude_returns_all(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_windows", lambda: False)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout=(
|
||||
"user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
|
||||
"user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
|
||||
),
|
||||
stderr="",
|
||||
)
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
monkeypatch.setattr("os.getpid", lambda: 999)
|
||||
|
||||
pids = gateway_cli.find_gateway_pids()
|
||||
assert 100 in pids
|
||||
assert 200 in pids
|
||||
|
||||
Reference in New Issue
Block a user