Merge pull request #1614 from PeterFile/fix/launchd-service-recovery

fix(gateway): recover stale launchd service state
This commit is contained in:
Teknium
2026-03-17 01:43:07 -07:00
committed by GitHub
2 changed files with 162 additions and 3 deletions

View File

@@ -562,6 +562,12 @@ def systemd_install(force: bool = False, system: bool = False, run_as_user: str
scope_flag = " --system" if system else ""
if unit_path.exists() and not force:
if not systemd_unit_is_current(system=system):
print(f"↻ Repairing outdated {_service_scope_label(system)} systemd service at: {unit_path}")
refresh_systemd_unit_if_needed(system=system)
subprocess.run(_systemctl_cmd(system) + ["enable", get_service_name()], check=True)
print(f"{_service_scope_label(system).capitalize()} service definition updated")
return
print(f"Service already installed at: {unit_path}")
print("Use --force to reinstall")
return
@@ -787,6 +793,11 @@ def launchd_install(force: bool = False):
plist_path = get_launchd_plist_path()
if plist_path.exists() and not force:
if not launchd_plist_is_current():
print(f"↻ Repairing outdated launchd service at: {plist_path}")
refresh_launchd_plist_if_needed()
print("✓ Service definition updated")
return
print(f"Service already installed at: {plist_path}")
print("Use --force to reinstall")
return
@@ -816,7 +827,15 @@ def launchd_uninstall():
def launchd_start():
refresh_launchd_plist_if_needed()
subprocess.run(["launchctl", "start", "ai.hermes.gateway"], check=True)
plist_path = get_launchd_plist_path()
try:
subprocess.run(["launchctl", "start", "ai.hermes.gateway"], check=True)
except subprocess.CalledProcessError as e:
if e.returncode != 3 or not plist_path.exists():
raise
print("↻ launchd job was unloaded; reloading service definition")
subprocess.run(["launchctl", "load", str(plist_path)], check=True)
subprocess.run(["launchctl", "start", "ai.hermes.gateway"], check=True)
print("✓ Service started")
def launchd_stop():
@@ -824,22 +843,36 @@ def launchd_stop():
print("✓ Service stopped")
def launchd_restart():
refresh_launchd_plist_if_needed()
launchd_stop()
try:
launchd_stop()
except subprocess.CalledProcessError as e:
if e.returncode != 3:
raise
print("↻ launchd job was unloaded; skipping stop")
launchd_start()
def launchd_status(deep: bool = False):
plist_path = get_launchd_plist_path()
result = subprocess.run(
["launchctl", "list", "ai.hermes.gateway"],
capture_output=True,
text=True
)
print(f"Launchd plist: {plist_path}")
if launchd_plist_is_current():
print("✓ Service definition matches the current Hermes install")
else:
print("⚠ Service definition is stale relative to the current Hermes install")
print(" Run: hermes gateway start")
if result.returncode == 0:
print("✓ Gateway service is loaded")
print(result.stdout)
else:
print("✗ Gateway service is not loaded")
print(" Service definition exists locally but launchd has not loaded it.")
print(" Run: hermes gateway start")
if deep:
log_file = get_hermes_home() / "logs" / "gateway.log"
@@ -1555,14 +1588,17 @@ def gateway_command(args):
# Try service first, fall back to killing and restarting
service_available = False
system = getattr(args, 'system', False)
service_configured = False
if is_linux() and (get_systemd_unit_path(system=False).exists() or get_systemd_unit_path(system=True).exists()):
service_configured = True
try:
systemd_restart(system=system)
service_available = True
except subprocess.CalledProcessError:
pass
elif is_macos() and get_launchd_plist_path().exists():
service_configured = True
try:
launchd_restart()
service_available = True
@@ -1586,6 +1622,13 @@ def gateway_command(args):
print(" hermes gateway restart")
return
if service_configured:
print()
print("✗ Gateway service restart failed.")
print(" The service definition exists, but the service manager did not recover it.")
print(" Fix the service, then retry: hermes gateway start")
sys.exit(1)
# Manual restart: kill existing processes
killed = kill_gateway_processes()
if killed:

View File

@@ -7,6 +7,29 @@ import hermes_cli.gateway as gateway_cli
class TestSystemdServiceRefresh:
def test_systemd_install_repairs_outdated_unit_without_force(self, tmp_path, monkeypatch):
unit_path = tmp_path / "hermes-gateway.service"
unit_path.write_text("old unit\n", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path)
monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda system=False, run_as_user=None: "new unit\n")
calls = []
def fake_run(cmd, check=True, **kwargs):
calls.append(cmd)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
gateway_cli.systemd_install()
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
assert calls[:2] == [
["systemctl", "--user", "daemon-reload"],
["systemctl", "--user", "enable", gateway_cli.get_service_name()],
]
def test_systemd_start_refreshes_outdated_unit(self, tmp_path, monkeypatch):
unit_path = tmp_path / "hermes-gateway.service"
unit_path.write_text("old unit\n", encoding="utf-8")
@@ -96,6 +119,71 @@ class TestGatewayStopCleanup:
assert kill_calls == [False]
class TestLaunchdServiceRecovery:
def test_launchd_install_repairs_outdated_plist_without_force(self, tmp_path, monkeypatch):
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
calls = []
def fake_run(cmd, check=False, **kwargs):
calls.append(cmd)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
gateway_cli.launchd_install()
assert "--replace" in plist_path.read_text(encoding="utf-8")
assert calls[:2] == [
["launchctl", "unload", str(plist_path)],
["launchctl", "load", str(plist_path)],
]
def test_launchd_start_reloads_unloaded_job_and_retries(self, tmp_path, monkeypatch):
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")
calls = []
def fake_run(cmd, check=False, **kwargs):
calls.append(cmd)
if cmd == ["launchctl", "start", "ai.hermes.gateway"] and calls.count(cmd) == 1:
raise gateway_cli.subprocess.CalledProcessError(3, cmd, stderr="Could not find service")
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
gateway_cli.launchd_start()
assert calls == [
["launchctl", "start", "ai.hermes.gateway"],
["launchctl", "load", str(plist_path)],
["launchctl", "start", "ai.hermes.gateway"],
]
def test_launchd_status_reports_local_stale_plist_when_unloaded(self, tmp_path, monkeypatch, capsys):
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
monkeypatch.setattr(
gateway_cli.subprocess,
"run",
lambda *args, **kwargs: SimpleNamespace(returncode=113, stdout="", stderr="Could not find service"),
)
gateway_cli.launchd_status()
output = capsys.readouterr().out
assert str(plist_path) in output
assert "stale" in output.lower()
assert "not loaded" in output.lower()
class TestGatewayServiceDetection:
def test_is_service_running_checks_system_scope_when_user_scope_is_inactive(self, monkeypatch):
user_unit = SimpleNamespace(exists=lambda: True)
@@ -158,6 +246,34 @@ class TestGatewaySystemServiceRouting:
assert calls == [(False, False)]
def test_gateway_restart_does_not_fallback_to_foreground_when_launchd_restart_fails(self, tmp_path, monkeypatch):
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text("plist\n", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
monkeypatch.setattr(
gateway_cli,
"launchd_restart",
lambda: (_ for _ in ()).throw(
gateway_cli.subprocess.CalledProcessError(5, ["launchctl", "start", "ai.hermes.gateway"])
),
)
run_calls = []
monkeypatch.setattr(gateway_cli, "run_gateway", lambda verbose=False, replace=False: run_calls.append((verbose, replace)))
monkeypatch.setattr(gateway_cli, "kill_gateway_processes", lambda force=False: 0)
try:
gateway_cli.gateway_command(SimpleNamespace(gateway_command="restart", system=False))
except SystemExit as exc:
assert exc.code == 1
else:
raise AssertionError("Expected gateway_command to exit when service restart fails")
assert run_calls == []
class TestEnsureUserSystemdEnv:
"""Tests for _ensure_user_systemd_env() D-Bus session bus auto-detection."""