fix: hermes update kills freshly-restarted gateway service

After restarting a service-managed gateway (systemd/launchd), the
stale-process sweep calls find_gateway_pids() which returns ALL gateway
PIDs via ps aux — including the one just spawned by the service manager.
The sweep kills it, leaving the user with a stopped gateway and a
confusing 'Restart manually' message.

Fix: add _get_service_pids() to query systemd MainPID and launchd PID
for active gateway services, then exclude those PIDs from the sweep.
Also add exclude_pids parameter to find_gateway_pids() and
kill_gateway_processes() so callers can skip known service-managed PIDs.

Adds 9 targeted tests covering:
- _get_service_pids() for systemd, launchd, empty, and zero-PID cases
- find_gateway_pids() exclude_pids filtering
- cmd_update integration: service PID not killed after restart
- cmd_update integration: manual PID killed while service PID preserved
This commit is contained in:
kshitijk4poor
2026-04-06 09:52:22 +05:30
committed by Teknium
parent 9c96f669a1
commit a2a9ad7431
3 changed files with 371 additions and 9 deletions

View File

@@ -28,9 +28,101 @@ from hermes_cli.colors import Colors, color
# Process Management (for manual gateway runs)
# =============================================================================
def find_gateway_pids() -> list:
"""Find PIDs of running gateway processes."""
def _get_service_pids() -> set:
"""Return PIDs currently managed by systemd or launchd gateway services.
Used to avoid killing freshly-restarted service processes when sweeping
for stale manual gateway processes after a service restart.
"""
pids: set = set()
# --- systemd (Linux) ---
if is_linux():
try:
result = subprocess.run(
["systemctl", "--user", "list-units", "hermes-gateway*",
"--plain", "--no-legend", "--no-pager"],
capture_output=True, text=True, timeout=5,
)
for line in result.stdout.strip().splitlines():
parts = line.split()
if not parts or not parts[0].endswith(".service"):
continue
svc = parts[0]
try:
show = subprocess.run(
["systemctl", "--user", "show", svc,
"--property=MainPID", "--value"],
capture_output=True, text=True, timeout=5,
)
pid = int(show.stdout.strip())
if pid > 0:
pids.add(pid)
except (ValueError, subprocess.TimeoutExpired):
pass
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
# Also check system scope
try:
result = subprocess.run(
["systemctl", "list-units", "hermes-gateway*",
"--plain", "--no-legend", "--no-pager"],
capture_output=True, text=True, timeout=5,
)
for line in result.stdout.strip().splitlines():
parts = line.split()
if not parts or not parts[0].endswith(".service"):
continue
svc = parts[0]
try:
show = subprocess.run(
["systemctl", "show", svc,
"--property=MainPID", "--value"],
capture_output=True, text=True, timeout=5,
)
pid = int(show.stdout.strip())
if pid > 0:
pids.add(pid)
except (ValueError, subprocess.TimeoutExpired):
pass
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
# --- launchd (macOS) ---
if is_macos():
try:
from hermes_cli.gateway import get_launchd_label
result = subprocess.run(
["launchctl", "list", get_launchd_label()],
capture_output=True, text=True, timeout=5,
)
if result.returncode == 0:
# Output format: "PID\tStatus\tLabel" header then data line
for line in result.stdout.strip().splitlines():
parts = line.split()
if parts:
try:
pid = int(parts[0])
if pid > 0:
pids.add(pid)
except ValueError:
pass
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
return pids
def find_gateway_pids(exclude_pids: set | None = None) -> list:
"""Find PIDs of running gateway processes.
Args:
exclude_pids: PIDs to exclude from the result (e.g. service-managed
PIDs that should not be killed during a stale-process sweep).
"""
pids = []
_exclude = exclude_pids or set()
patterns = [
"hermes_cli.main gateway",
"hermes_cli/main.py gateway",
@@ -56,7 +148,7 @@ def find_gateway_pids() -> list:
if any(p in current_cmd for p in patterns):
try:
pid = int(pid_str)
if pid != os.getpid() and pid not in pids:
if pid != os.getpid() and pid not in pids and pid not in _exclude:
pids.append(pid)
except ValueError:
pass
@@ -78,7 +170,7 @@ def find_gateway_pids() -> list:
if len(parts) > 1:
try:
pid = int(parts[1])
if pid not in pids:
if pid not in pids and pid not in _exclude:
pids.append(pid)
except ValueError:
continue
@@ -89,9 +181,15 @@ def find_gateway_pids() -> list:
return pids
def kill_gateway_processes(force: bool = False) -> int:
"""Kill ALL running gateway processes (across all profiles). Returns count killed."""
pids = find_gateway_pids()
def kill_gateway_processes(force: bool = False, exclude_pids: set | None = None) -> int:
"""Kill any running gateway processes. Returns count killed.
Args:
force: Use SIGKILL instead of SIGTERM.
exclude_pids: PIDs to skip (e.g. service-managed PIDs that were just
restarted and should not be killed).
"""
pids = find_gateway_pids(exclude_pids=exclude_pids)
killed = 0
for pid in pids:

View File

@@ -3607,6 +3607,7 @@ def cmd_update(args):
from hermes_cli.gateway import (
is_macos, is_linux, _ensure_user_systemd_env,
get_systemd_linger_status, find_gateway_pids,
_get_service_pids,
)
import signal as _signal
@@ -3673,8 +3674,11 @@ def cmd_update(args):
pass
# --- Manual (non-service) gateways ---
# Kill any remaining gateway processes not managed by a service
manual_pids = find_gateway_pids()
# Kill any remaining gateway processes not managed by a service.
# Exclude PIDs that belong to just-restarted services so we don't
# immediately kill the process that systemd/launchd just spawned.
service_pids = _get_service_pids()
manual_pids = find_gateway_pids(exclude_pids=service_pids)
for pid in manual_pids:
try:
os.kill(pid, _signal.SIGTERM)

View File

@@ -491,3 +491,263 @@ class TestCmdUpdateSystemService:
captured = capsys.readouterr().out
# Both scopes are discovered and restarted
assert "Restarted hermes-gateway" in captured
# ---------------------------------------------------------------------------
# Service PID exclusion — the core bug fix
# ---------------------------------------------------------------------------
class TestServicePidExclusion:
"""After restarting a service, the stale-process sweep must NOT kill
the freshly-spawned service PID. This was the root cause of the bug
where ``hermes update`` would restart the gateway and immediately kill it.
"""
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_launchd_does_not_kill_service_pid(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path,
):
"""After launchd restart, the sweep must exclude the service PID."""
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text("<plist/>")
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
# The service PID that launchd manages after restart
SERVICE_PID = 42000
mock_run.side_effect = _make_run_side_effect(
commit_count="3",
launchctl_loaded=True,
)
# Simulate find_gateway_pids returning the service PID (the bug scenario)
# and _get_service_pids returning the same PID to exclude it
with patch.object(
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
), patch.object(
gateway_cli, "find_gateway_pids",
side_effect=lambda exclude_pids=None: (
[SERVICE_PID] if not exclude_pids else
[p for p in [SERVICE_PID] if p not in exclude_pids]
),
), patch("os.kill") as mock_kill:
cmd_update(mock_args)
captured = capsys.readouterr().out
# Service was restarted
assert "Restarted" in captured
# The service PID should NOT have been killed by the manual sweep
kill_calls = [
c for c in mock_kill.call_args_list
if c.args[0] == SERVICE_PID
]
assert len(kill_calls) == 0, (
f"Service PID {SERVICE_PID} was killed by the manual sweep — "
f"this is the bug where update restarts then immediately kills the gateway"
)
# Should NOT show manual restart message
assert "Restart manually" not in captured
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_systemd_does_not_kill_service_pid(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
):
"""After systemd restart, the sweep must exclude the service PID."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
SERVICE_PID = 55000
mock_run.side_effect = _make_run_side_effect(
commit_count="3",
systemd_active=True,
)
with patch.object(
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
), patch.object(
gateway_cli, "find_gateway_pids",
side_effect=lambda exclude_pids=None: (
[SERVICE_PID] if not exclude_pids else
[p for p in [SERVICE_PID] if p not in exclude_pids]
),
), patch("os.kill") as mock_kill:
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "Restarted hermes-gateway" in captured
# Service PID must not be killed
kill_calls = [
c for c in mock_kill.call_args_list
if c.args[0] == SERVICE_PID
]
assert len(kill_calls) == 0
assert "Restart manually" not in captured
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_kills_manual_pid_but_not_service_pid(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path,
):
"""When both a service PID and a manual PID exist, only the manual one
is killed."""
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text("<plist/>")
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
SERVICE_PID = 42000
MANUAL_PID = 42999
mock_run.side_effect = _make_run_side_effect(
commit_count="3",
launchctl_loaded=True,
)
def fake_find(exclude_pids=None):
_exclude = exclude_pids or set()
return [p for p in [SERVICE_PID, MANUAL_PID] if p not in _exclude]
with patch.object(
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
), patch.object(
gateway_cli, "find_gateway_pids", side_effect=fake_find,
), patch("os.kill") as mock_kill:
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "Restarted" in captured
# Manual PID should be killed
manual_kills = [c for c in mock_kill.call_args_list if c.args[0] == MANUAL_PID]
assert len(manual_kills) == 1
# Service PID should NOT be killed
service_kills = [c for c in mock_kill.call_args_list if c.args[0] == SERVICE_PID]
assert len(service_kills) == 0
# Should show manual stop message since manual PID was killed
assert "Stopped 1 manual gateway" in captured
class TestGetServicePids:
"""Unit tests for _get_service_pids()."""
def test_returns_systemd_main_pid(self, monkeypatch):
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
def fake_run(cmd, **kwargs):
joined = " ".join(str(c) for c in cmd)
if "list-units" in joined:
return subprocess.CompletedProcess(
cmd, 0,
stdout="hermes-gateway.service loaded active running Hermes Gateway\n",
stderr="",
)
if "show" in joined and "MainPID" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="12345\n", stderr="")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
pids = gateway_cli._get_service_pids()
assert 12345 in pids
def test_returns_launchd_pid(self, monkeypatch):
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
def fake_run(cmd, **kwargs):
joined = " ".join(str(c) for c in cmd)
if "launchctl" in joined and "list" in joined:
return subprocess.CompletedProcess(
cmd, 0,
stdout="PID\tStatus\tLabel\n67890\t0\tai.hermes.gateway\n",
stderr="",
)
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
pids = gateway_cli._get_service_pids()
assert 67890 in pids
def test_returns_empty_when_no_services(self, monkeypatch):
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
pids = gateway_cli._get_service_pids()
assert pids == set()
def test_excludes_zero_pid(self, monkeypatch):
"""systemd returns MainPID=0 for stopped services; skip those."""
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
def fake_run(cmd, **kwargs):
joined = " ".join(str(c) for c in cmd)
if "list-units" in joined:
return subprocess.CompletedProcess(
cmd, 0,
stdout="hermes-gateway.service loaded inactive dead Hermes Gateway\n",
stderr="",
)
if "show" in joined and "MainPID" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="0\n", stderr="")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
pids = gateway_cli._get_service_pids()
assert 0 not in pids
assert pids == set()
class TestFindGatewayPidsExclude:
"""find_gateway_pids respects exclude_pids."""
def test_excludes_specified_pids(self, monkeypatch):
monkeypatch.setattr(gateway_cli, "is_windows", lambda: False)
def fake_run(cmd, **kwargs):
return subprocess.CompletedProcess(
cmd, 0,
stdout=(
"user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
"user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
),
stderr="",
)
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
monkeypatch.setattr("os.getpid", lambda: 999)
pids = gateway_cli.find_gateway_pids(exclude_pids={100})
assert 100 not in pids
assert 200 in pids
def test_no_exclude_returns_all(self, monkeypatch):
monkeypatch.setattr(gateway_cli, "is_windows", lambda: False)
def fake_run(cmd, **kwargs):
return subprocess.CompletedProcess(
cmd, 0,
stdout=(
"user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
"user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
),
stderr="",
)
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
monkeypatch.setattr("os.getpid", lambda: 999)
pids = gateway_cli.find_gateway_pids()
assert 100 in pids
assert 200 in pids