diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py
index 93f3a9358..f328d03b7 100644
--- a/hermes_cli/gateway.py
+++ b/hermes_cli/gateway.py
@@ -28,9 +28,101 @@ from hermes_cli.colors import Colors, color
# Process Management (for manual gateway runs)
# =============================================================================
-def find_gateway_pids() -> list:
- """Find PIDs of running gateway processes."""
+def _get_service_pids() -> set:
+ """Return PIDs currently managed by systemd or launchd gateway services.
+
+ Used to avoid killing freshly-restarted service processes when sweeping
+ for stale manual gateway processes after a service restart.
+ """
+ pids: set = set()
+
+ # --- systemd (Linux) ---
+ if is_linux():
+ try:
+ result = subprocess.run(
+ ["systemctl", "--user", "list-units", "hermes-gateway*",
+ "--plain", "--no-legend", "--no-pager"],
+ capture_output=True, text=True, timeout=5,
+ )
+ for line in result.stdout.strip().splitlines():
+ parts = line.split()
+ if not parts or not parts[0].endswith(".service"):
+ continue
+ svc = parts[0]
+ try:
+ show = subprocess.run(
+ ["systemctl", "--user", "show", svc,
+ "--property=MainPID", "--value"],
+ capture_output=True, text=True, timeout=5,
+ )
+ pid = int(show.stdout.strip())
+ if pid > 0:
+ pids.add(pid)
+ except (ValueError, subprocess.TimeoutExpired):
+ pass
+ except (FileNotFoundError, subprocess.TimeoutExpired):
+ pass
+
+ # Also check system scope
+ try:
+ result = subprocess.run(
+ ["systemctl", "list-units", "hermes-gateway*",
+ "--plain", "--no-legend", "--no-pager"],
+ capture_output=True, text=True, timeout=5,
+ )
+ for line in result.stdout.strip().splitlines():
+ parts = line.split()
+ if not parts or not parts[0].endswith(".service"):
+ continue
+ svc = parts[0]
+ try:
+ show = subprocess.run(
+ ["systemctl", "show", svc,
+ "--property=MainPID", "--value"],
+ capture_output=True, text=True, timeout=5,
+ )
+ pid = int(show.stdout.strip())
+ if pid > 0:
+ pids.add(pid)
+ except (ValueError, subprocess.TimeoutExpired):
+ pass
+ except (FileNotFoundError, subprocess.TimeoutExpired):
+ pass
+
+ # --- launchd (macOS) ---
+ if is_macos():
+ try:
+ from hermes_cli.gateway import get_launchd_label
+ result = subprocess.run(
+ ["launchctl", "list", get_launchd_label()],
+ capture_output=True, text=True, timeout=5,
+ )
+ if result.returncode == 0:
+ # Output format: "PID\tStatus\tLabel" header then data line
+ for line in result.stdout.strip().splitlines():
+ parts = line.split()
+ if parts:
+ try:
+ pid = int(parts[0])
+ if pid > 0:
+ pids.add(pid)
+ except ValueError:
+ pass
+ except (FileNotFoundError, subprocess.TimeoutExpired):
+ pass
+
+ return pids
+
+
+def find_gateway_pids(exclude_pids: set | None = None) -> list:
+ """Find PIDs of running gateway processes.
+
+ Args:
+ exclude_pids: PIDs to exclude from the result (e.g. service-managed
+ PIDs that should not be killed during a stale-process sweep).
+ """
pids = []
+ _exclude = exclude_pids or set()
patterns = [
"hermes_cli.main gateway",
"hermes_cli/main.py gateway",
@@ -56,7 +148,7 @@ def find_gateway_pids() -> list:
if any(p in current_cmd for p in patterns):
try:
pid = int(pid_str)
- if pid != os.getpid() and pid not in pids:
+ if pid != os.getpid() and pid not in pids and pid not in _exclude:
pids.append(pid)
except ValueError:
pass
@@ -78,7 +170,7 @@ def find_gateway_pids() -> list:
if len(parts) > 1:
try:
pid = int(parts[1])
- if pid not in pids:
+ if pid not in pids and pid not in _exclude:
pids.append(pid)
except ValueError:
continue
@@ -89,9 +181,15 @@ def find_gateway_pids() -> list:
return pids
-def kill_gateway_processes(force: bool = False) -> int:
- """Kill ALL running gateway processes (across all profiles). Returns count killed."""
- pids = find_gateway_pids()
+def kill_gateway_processes(force: bool = False, exclude_pids: set | None = None) -> int:
+ """Kill any running gateway processes. Returns count killed.
+
+ Args:
+ force: Use SIGKILL instead of SIGTERM.
+ exclude_pids: PIDs to skip (e.g. service-managed PIDs that were just
+ restarted and should not be killed).
+ """
+ pids = find_gateway_pids(exclude_pids=exclude_pids)
killed = 0
for pid in pids:
diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index 5994e5cea..ad5d5b036 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -3607,6 +3607,7 @@ def cmd_update(args):
from hermes_cli.gateway import (
is_macos, is_linux, _ensure_user_systemd_env,
get_systemd_linger_status, find_gateway_pids,
+ _get_service_pids,
)
import signal as _signal
@@ -3673,8 +3674,11 @@ def cmd_update(args):
pass
# --- Manual (non-service) gateways ---
- # Kill any remaining gateway processes not managed by a service
- manual_pids = find_gateway_pids()
+ # Kill any remaining gateway processes not managed by a service.
+ # Exclude PIDs that belong to just-restarted services so we don't
+ # immediately kill the process that systemd/launchd just spawned.
+ service_pids = _get_service_pids()
+ manual_pids = find_gateway_pids(exclude_pids=service_pids)
for pid in manual_pids:
try:
os.kill(pid, _signal.SIGTERM)
diff --git a/tests/hermes_cli/test_update_gateway_restart.py b/tests/hermes_cli/test_update_gateway_restart.py
index ca25c05a7..d716cfb50 100644
--- a/tests/hermes_cli/test_update_gateway_restart.py
+++ b/tests/hermes_cli/test_update_gateway_restart.py
@@ -491,3 +491,263 @@ class TestCmdUpdateSystemService:
captured = capsys.readouterr().out
# Both scopes are discovered and restarted
assert "Restarted hermes-gateway" in captured
+
+
+# ---------------------------------------------------------------------------
+# Service PID exclusion — the core bug fix
+# ---------------------------------------------------------------------------
+
+
+class TestServicePidExclusion:
+ """After restarting a service, the stale-process sweep must NOT kill
+ the freshly-spawned service PID. This was the root cause of the bug
+ where ``hermes update`` would restart the gateway and immediately kill it.
+ """
+
+ @patch("shutil.which", return_value=None)
+ @patch("subprocess.run")
+ def test_update_launchd_does_not_kill_service_pid(
+ self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path,
+ ):
+ """After launchd restart, the sweep must exclude the service PID."""
+ plist_path = tmp_path / "ai.hermes.gateway.plist"
+ plist_path.write_text("")
+
+ monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
+ monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
+ monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
+
+ # The service PID that launchd manages after restart
+ SERVICE_PID = 42000
+
+ mock_run.side_effect = _make_run_side_effect(
+ commit_count="3",
+ launchctl_loaded=True,
+ )
+
+ # Simulate find_gateway_pids returning the service PID (the bug scenario)
+ # and _get_service_pids returning the same PID to exclude it
+ with patch.object(
+ gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
+ ), patch.object(
+ gateway_cli, "find_gateway_pids",
+ side_effect=lambda exclude_pids=None: (
+ [SERVICE_PID] if not exclude_pids else
+ [p for p in [SERVICE_PID] if p not in exclude_pids]
+ ),
+ ), patch("os.kill") as mock_kill:
+ cmd_update(mock_args)
+
+ captured = capsys.readouterr().out
+ # Service was restarted
+ assert "Restarted" in captured
+ # The service PID should NOT have been killed by the manual sweep
+ kill_calls = [
+ c for c in mock_kill.call_args_list
+ if c.args[0] == SERVICE_PID
+ ]
+ assert len(kill_calls) == 0, (
+ f"Service PID {SERVICE_PID} was killed by the manual sweep — "
+ f"this is the bug where update restarts then immediately kills the gateway"
+ )
+ # Should NOT show manual restart message
+ assert "Restart manually" not in captured
+
+ @patch("shutil.which", return_value=None)
+ @patch("subprocess.run")
+ def test_update_systemd_does_not_kill_service_pid(
+ self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
+ ):
+ """After systemd restart, the sweep must exclude the service PID."""
+ monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
+ monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
+
+ SERVICE_PID = 55000
+
+ mock_run.side_effect = _make_run_side_effect(
+ commit_count="3",
+ systemd_active=True,
+ )
+
+ with patch.object(
+ gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
+ ), patch.object(
+ gateway_cli, "find_gateway_pids",
+ side_effect=lambda exclude_pids=None: (
+ [SERVICE_PID] if not exclude_pids else
+ [p for p in [SERVICE_PID] if p not in exclude_pids]
+ ),
+ ), patch("os.kill") as mock_kill:
+ cmd_update(mock_args)
+
+ captured = capsys.readouterr().out
+ assert "Restarted hermes-gateway" in captured
+ # Service PID must not be killed
+ kill_calls = [
+ c for c in mock_kill.call_args_list
+ if c.args[0] == SERVICE_PID
+ ]
+ assert len(kill_calls) == 0
+ assert "Restart manually" not in captured
+
+ @patch("shutil.which", return_value=None)
+ @patch("subprocess.run")
+ def test_update_kills_manual_pid_but_not_service_pid(
+ self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path,
+ ):
+ """When both a service PID and a manual PID exist, only the manual one
+ is killed."""
+ plist_path = tmp_path / "ai.hermes.gateway.plist"
+ plist_path.write_text("")
+
+ monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
+ monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
+ monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
+
+ SERVICE_PID = 42000
+ MANUAL_PID = 42999
+
+ mock_run.side_effect = _make_run_side_effect(
+ commit_count="3",
+ launchctl_loaded=True,
+ )
+
+ def fake_find(exclude_pids=None):
+ _exclude = exclude_pids or set()
+ return [p for p in [SERVICE_PID, MANUAL_PID] if p not in _exclude]
+
+ with patch.object(
+ gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
+ ), patch.object(
+ gateway_cli, "find_gateway_pids", side_effect=fake_find,
+ ), patch("os.kill") as mock_kill:
+ cmd_update(mock_args)
+
+ captured = capsys.readouterr().out
+ assert "Restarted" in captured
+ # Manual PID should be killed
+ manual_kills = [c for c in mock_kill.call_args_list if c.args[0] == MANUAL_PID]
+ assert len(manual_kills) == 1
+ # Service PID should NOT be killed
+ service_kills = [c for c in mock_kill.call_args_list if c.args[0] == SERVICE_PID]
+ assert len(service_kills) == 0
+ # Should show manual stop message since manual PID was killed
+ assert "Stopped 1 manual gateway" in captured
+
+
+class TestGetServicePids:
+ """Unit tests for _get_service_pids()."""
+
+ def test_returns_systemd_main_pid(self, monkeypatch):
+ monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
+ monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
+
+ def fake_run(cmd, **kwargs):
+ joined = " ".join(str(c) for c in cmd)
+ if "list-units" in joined:
+ return subprocess.CompletedProcess(
+ cmd, 0,
+ stdout="hermes-gateway.service loaded active running Hermes Gateway\n",
+ stderr="",
+ )
+ if "show" in joined and "MainPID" in joined:
+ return subprocess.CompletedProcess(cmd, 0, stdout="12345\n", stderr="")
+ return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
+
+ monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+
+ pids = gateway_cli._get_service_pids()
+ assert 12345 in pids
+
+ def test_returns_launchd_pid(self, monkeypatch):
+ monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
+ monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
+
+ def fake_run(cmd, **kwargs):
+ joined = " ".join(str(c) for c in cmd)
+ if "launchctl" in joined and "list" in joined:
+ return subprocess.CompletedProcess(
+ cmd, 0,
+ stdout="PID\tStatus\tLabel\n67890\t0\tai.hermes.gateway\n",
+ stderr="",
+ )
+ return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
+
+ monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+
+ pids = gateway_cli._get_service_pids()
+ assert 67890 in pids
+
+ def test_returns_empty_when_no_services(self, monkeypatch):
+ monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
+ monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
+
+ pids = gateway_cli._get_service_pids()
+ assert pids == set()
+
+ def test_excludes_zero_pid(self, monkeypatch):
+ """systemd returns MainPID=0 for stopped services; skip those."""
+ monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
+ monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
+
+ def fake_run(cmd, **kwargs):
+ joined = " ".join(str(c) for c in cmd)
+ if "list-units" in joined:
+ return subprocess.CompletedProcess(
+ cmd, 0,
+ stdout="hermes-gateway.service loaded inactive dead Hermes Gateway\n",
+ stderr="",
+ )
+ if "show" in joined and "MainPID" in joined:
+ return subprocess.CompletedProcess(cmd, 0, stdout="0\n", stderr="")
+ return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
+
+ monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+
+ pids = gateway_cli._get_service_pids()
+ assert 0 not in pids
+ assert pids == set()
+
+
+class TestFindGatewayPidsExclude:
+ """find_gateway_pids respects exclude_pids."""
+
+ def test_excludes_specified_pids(self, monkeypatch):
+ monkeypatch.setattr(gateway_cli, "is_windows", lambda: False)
+
+ def fake_run(cmd, **kwargs):
+ return subprocess.CompletedProcess(
+ cmd, 0,
+ stdout=(
+ "user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
+ "user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
+ ),
+ stderr="",
+ )
+
+ monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+ monkeypatch.setattr("os.getpid", lambda: 999)
+
+ pids = gateway_cli.find_gateway_pids(exclude_pids={100})
+ assert 100 not in pids
+ assert 200 in pids
+
+ def test_no_exclude_returns_all(self, monkeypatch):
+ monkeypatch.setattr(gateway_cli, "is_windows", lambda: False)
+
+ def fake_run(cmd, **kwargs):
+ return subprocess.CompletedProcess(
+ cmd, 0,
+ stdout=(
+ "user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
+ "user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
+ ),
+ stderr="",
+ )
+
+ monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+ monkeypatch.setattr("os.getpid", lambda: 999)
+
+ pids = gateway_cli.find_gateway_pids()
+ assert 100 in pids
+ assert 200 in pids