diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 93f3a9358..f328d03b7 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -28,9 +28,101 @@ from hermes_cli.colors import Colors, color # Process Management (for manual gateway runs) # ============================================================================= -def find_gateway_pids() -> list: - """Find PIDs of running gateway processes.""" +def _get_service_pids() -> set: + """Return PIDs currently managed by systemd or launchd gateway services. + + Used to avoid killing freshly-restarted service processes when sweeping + for stale manual gateway processes after a service restart. + """ + pids: set = set() + + # --- systemd (Linux) --- + if is_linux(): + try: + result = subprocess.run( + ["systemctl", "--user", "list-units", "hermes-gateway*", + "--plain", "--no-legend", "--no-pager"], + capture_output=True, text=True, timeout=5, + ) + for line in result.stdout.strip().splitlines(): + parts = line.split() + if not parts or not parts[0].endswith(".service"): + continue + svc = parts[0] + try: + show = subprocess.run( + ["systemctl", "--user", "show", svc, + "--property=MainPID", "--value"], + capture_output=True, text=True, timeout=5, + ) + pid = int(show.stdout.strip()) + if pid > 0: + pids.add(pid) + except (ValueError, subprocess.TimeoutExpired): + pass + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + + # Also check system scope + try: + result = subprocess.run( + ["systemctl", "list-units", "hermes-gateway*", + "--plain", "--no-legend", "--no-pager"], + capture_output=True, text=True, timeout=5, + ) + for line in result.stdout.strip().splitlines(): + parts = line.split() + if not parts or not parts[0].endswith(".service"): + continue + svc = parts[0] + try: + show = subprocess.run( + ["systemctl", "show", svc, + "--property=MainPID", "--value"], + capture_output=True, text=True, timeout=5, + ) + pid = int(show.stdout.strip()) + if pid > 0: + pids.add(pid) + except (ValueError, subprocess.TimeoutExpired): + pass + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + + # --- launchd (macOS) --- + if is_macos(): + try: + from hermes_cli.gateway import get_launchd_label + result = subprocess.run( + ["launchctl", "list", get_launchd_label()], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0: + # Output format: "PID\tStatus\tLabel" header then data line + for line in result.stdout.strip().splitlines(): + parts = line.split() + if parts: + try: + pid = int(parts[0]) + if pid > 0: + pids.add(pid) + except ValueError: + pass + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + + return pids + + +def find_gateway_pids(exclude_pids: set | None = None) -> list: + """Find PIDs of running gateway processes. + + Args: + exclude_pids: PIDs to exclude from the result (e.g. service-managed + PIDs that should not be killed during a stale-process sweep). + """ pids = [] + _exclude = exclude_pids or set() patterns = [ "hermes_cli.main gateway", "hermes_cli/main.py gateway", @@ -56,7 +148,7 @@ def find_gateway_pids() -> list: if any(p in current_cmd for p in patterns): try: pid = int(pid_str) - if pid != os.getpid() and pid not in pids: + if pid != os.getpid() and pid not in pids and pid not in _exclude: pids.append(pid) except ValueError: pass @@ -78,7 +170,7 @@ def find_gateway_pids() -> list: if len(parts) > 1: try: pid = int(parts[1]) - if pid not in pids: + if pid not in pids and pid not in _exclude: pids.append(pid) except ValueError: continue @@ -89,9 +181,15 @@ def find_gateway_pids() -> list: return pids -def kill_gateway_processes(force: bool = False) -> int: - """Kill ALL running gateway processes (across all profiles). Returns count killed.""" - pids = find_gateway_pids() +def kill_gateway_processes(force: bool = False, exclude_pids: set | None = None) -> int: + """Kill any running gateway processes. Returns count killed. + + Args: + force: Use SIGKILL instead of SIGTERM. + exclude_pids: PIDs to skip (e.g. service-managed PIDs that were just + restarted and should not be killed). + """ + pids = find_gateway_pids(exclude_pids=exclude_pids) killed = 0 for pid in pids: diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 5994e5cea..ad5d5b036 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -3607,6 +3607,7 @@ def cmd_update(args): from hermes_cli.gateway import ( is_macos, is_linux, _ensure_user_systemd_env, get_systemd_linger_status, find_gateway_pids, + _get_service_pids, ) import signal as _signal @@ -3673,8 +3674,11 @@ def cmd_update(args): pass # --- Manual (non-service) gateways --- - # Kill any remaining gateway processes not managed by a service - manual_pids = find_gateway_pids() + # Kill any remaining gateway processes not managed by a service. + # Exclude PIDs that belong to just-restarted services so we don't + # immediately kill the process that systemd/launchd just spawned. + service_pids = _get_service_pids() + manual_pids = find_gateway_pids(exclude_pids=service_pids) for pid in manual_pids: try: os.kill(pid, _signal.SIGTERM) diff --git a/tests/hermes_cli/test_update_gateway_restart.py b/tests/hermes_cli/test_update_gateway_restart.py index ca25c05a7..d716cfb50 100644 --- a/tests/hermes_cli/test_update_gateway_restart.py +++ b/tests/hermes_cli/test_update_gateway_restart.py @@ -491,3 +491,263 @@ class TestCmdUpdateSystemService: captured = capsys.readouterr().out # Both scopes are discovered and restarted assert "Restarted hermes-gateway" in captured + + +# --------------------------------------------------------------------------- +# Service PID exclusion — the core bug fix +# --------------------------------------------------------------------------- + + +class TestServicePidExclusion: + """After restarting a service, the stale-process sweep must NOT kill + the freshly-spawned service PID. This was the root cause of the bug + where ``hermes update`` would restart the gateway and immediately kill it. + """ + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_launchd_does_not_kill_service_pid( + self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path, + ): + """After launchd restart, the sweep must exclude the service PID.""" + plist_path = tmp_path / "ai.hermes.gateway.plist" + plist_path.write_text("") + + monkeypatch.setattr(gateway_cli, "is_macos", lambda: True) + monkeypatch.setattr(gateway_cli, "is_linux", lambda: False) + monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path) + + # The service PID that launchd manages after restart + SERVICE_PID = 42000 + + mock_run.side_effect = _make_run_side_effect( + commit_count="3", + launchctl_loaded=True, + ) + + # Simulate find_gateway_pids returning the service PID (the bug scenario) + # and _get_service_pids returning the same PID to exclude it + with patch.object( + gateway_cli, "_get_service_pids", return_value={SERVICE_PID} + ), patch.object( + gateway_cli, "find_gateway_pids", + side_effect=lambda exclude_pids=None: ( + [SERVICE_PID] if not exclude_pids else + [p for p in [SERVICE_PID] if p not in exclude_pids] + ), + ), patch("os.kill") as mock_kill: + cmd_update(mock_args) + + captured = capsys.readouterr().out + # Service was restarted + assert "Restarted" in captured + # The service PID should NOT have been killed by the manual sweep + kill_calls = [ + c for c in mock_kill.call_args_list + if c.args[0] == SERVICE_PID + ] + assert len(kill_calls) == 0, ( + f"Service PID {SERVICE_PID} was killed by the manual sweep — " + f"this is the bug where update restarts then immediately kills the gateway" + ) + # Should NOT show manual restart message + assert "Restart manually" not in captured + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_systemd_does_not_kill_service_pid( + self, mock_run, _mock_which, mock_args, capsys, monkeypatch, + ): + """After systemd restart, the sweep must exclude the service PID.""" + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + monkeypatch.setattr(gateway_cli, "is_linux", lambda: True) + + SERVICE_PID = 55000 + + mock_run.side_effect = _make_run_side_effect( + commit_count="3", + systemd_active=True, + ) + + with patch.object( + gateway_cli, "_get_service_pids", return_value={SERVICE_PID} + ), patch.object( + gateway_cli, "find_gateway_pids", + side_effect=lambda exclude_pids=None: ( + [SERVICE_PID] if not exclude_pids else + [p for p in [SERVICE_PID] if p not in exclude_pids] + ), + ), patch("os.kill") as mock_kill: + cmd_update(mock_args) + + captured = capsys.readouterr().out + assert "Restarted hermes-gateway" in captured + # Service PID must not be killed + kill_calls = [ + c for c in mock_kill.call_args_list + if c.args[0] == SERVICE_PID + ] + assert len(kill_calls) == 0 + assert "Restart manually" not in captured + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_kills_manual_pid_but_not_service_pid( + self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path, + ): + """When both a service PID and a manual PID exist, only the manual one + is killed.""" + plist_path = tmp_path / "ai.hermes.gateway.plist" + plist_path.write_text("") + + monkeypatch.setattr(gateway_cli, "is_macos", lambda: True) + monkeypatch.setattr(gateway_cli, "is_linux", lambda: False) + monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path) + + SERVICE_PID = 42000 + MANUAL_PID = 42999 + + mock_run.side_effect = _make_run_side_effect( + commit_count="3", + launchctl_loaded=True, + ) + + def fake_find(exclude_pids=None): + _exclude = exclude_pids or set() + return [p for p in [SERVICE_PID, MANUAL_PID] if p not in _exclude] + + with patch.object( + gateway_cli, "_get_service_pids", return_value={SERVICE_PID} + ), patch.object( + gateway_cli, "find_gateway_pids", side_effect=fake_find, + ), patch("os.kill") as mock_kill: + cmd_update(mock_args) + + captured = capsys.readouterr().out + assert "Restarted" in captured + # Manual PID should be killed + manual_kills = [c for c in mock_kill.call_args_list if c.args[0] == MANUAL_PID] + assert len(manual_kills) == 1 + # Service PID should NOT be killed + service_kills = [c for c in mock_kill.call_args_list if c.args[0] == SERVICE_PID] + assert len(service_kills) == 0 + # Should show manual stop message since manual PID was killed + assert "Stopped 1 manual gateway" in captured + + +class TestGetServicePids: + """Unit tests for _get_service_pids().""" + + def test_returns_systemd_main_pid(self, monkeypatch): + monkeypatch.setattr(gateway_cli, "is_linux", lambda: True) + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + + def fake_run(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + if "list-units" in joined: + return subprocess.CompletedProcess( + cmd, 0, + stdout="hermes-gateway.service loaded active running Hermes Gateway\n", + stderr="", + ) + if "show" in joined and "MainPID" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="12345\n", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + pids = gateway_cli._get_service_pids() + assert 12345 in pids + + def test_returns_launchd_pid(self, monkeypatch): + monkeypatch.setattr(gateway_cli, "is_linux", lambda: False) + monkeypatch.setattr(gateway_cli, "is_macos", lambda: True) + + def fake_run(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + if "launchctl" in joined and "list" in joined: + return subprocess.CompletedProcess( + cmd, 0, + stdout="PID\tStatus\tLabel\n67890\t0\tai.hermes.gateway\n", + stderr="", + ) + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + pids = gateway_cli._get_service_pids() + assert 67890 in pids + + def test_returns_empty_when_no_services(self, monkeypatch): + monkeypatch.setattr(gateway_cli, "is_linux", lambda: False) + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + + pids = gateway_cli._get_service_pids() + assert pids == set() + + def test_excludes_zero_pid(self, monkeypatch): + """systemd returns MainPID=0 for stopped services; skip those.""" + monkeypatch.setattr(gateway_cli, "is_linux", lambda: True) + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + + def fake_run(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + if "list-units" in joined: + return subprocess.CompletedProcess( + cmd, 0, + stdout="hermes-gateway.service loaded inactive dead Hermes Gateway\n", + stderr="", + ) + if "show" in joined and "MainPID" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="0\n", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + pids = gateway_cli._get_service_pids() + assert 0 not in pids + assert pids == set() + + +class TestFindGatewayPidsExclude: + """find_gateway_pids respects exclude_pids.""" + + def test_excludes_specified_pids(self, monkeypatch): + monkeypatch.setattr(gateway_cli, "is_windows", lambda: False) + + def fake_run(cmd, **kwargs): + return subprocess.CompletedProcess( + cmd, 0, + stdout=( + "user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n" + "user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n" + ), + stderr="", + ) + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + monkeypatch.setattr("os.getpid", lambda: 999) + + pids = gateway_cli.find_gateway_pids(exclude_pids={100}) + assert 100 not in pids + assert 200 in pids + + def test_no_exclude_returns_all(self, monkeypatch): + monkeypatch.setattr(gateway_cli, "is_windows", lambda: False) + + def fake_run(cmd, **kwargs): + return subprocess.CompletedProcess( + cmd, 0, + stdout=( + "user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n" + "user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n" + ), + stderr="", + ) + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + monkeypatch.setattr("os.getpid", lambda: 999) + + pids = gateway_cli.find_gateway_pids() + assert 100 in pids + assert 200 in pids