fix(gateway): Recover stale service state
Repair stale launchd/systemd service definitions during install, and teach launchd start to reload an unloaded job before retrying. Stop masking service restart failures: when a configured service manager is still broken, report the failure instead of falling back to a foreground gateway. Refs: #1613
This commit is contained in:
@@ -562,6 +562,12 @@ def systemd_install(force: bool = False, system: bool = False, run_as_user: str
|
|||||||
scope_flag = " --system" if system else ""
|
scope_flag = " --system" if system else ""
|
||||||
|
|
||||||
if unit_path.exists() and not force:
|
if unit_path.exists() and not force:
|
||||||
|
if not systemd_unit_is_current(system=system):
|
||||||
|
print(f"↻ Repairing outdated {_service_scope_label(system)} systemd service at: {unit_path}")
|
||||||
|
refresh_systemd_unit_if_needed(system=system)
|
||||||
|
subprocess.run(_systemctl_cmd(system) + ["enable", get_service_name()], check=True)
|
||||||
|
print(f"✓ {_service_scope_label(system).capitalize()} service definition updated")
|
||||||
|
return
|
||||||
print(f"Service already installed at: {unit_path}")
|
print(f"Service already installed at: {unit_path}")
|
||||||
print("Use --force to reinstall")
|
print("Use --force to reinstall")
|
||||||
return
|
return
|
||||||
@@ -787,6 +793,11 @@ def launchd_install(force: bool = False):
|
|||||||
plist_path = get_launchd_plist_path()
|
plist_path = get_launchd_plist_path()
|
||||||
|
|
||||||
if plist_path.exists() and not force:
|
if plist_path.exists() and not force:
|
||||||
|
if not launchd_plist_is_current():
|
||||||
|
print(f"↻ Repairing outdated launchd service at: {plist_path}")
|
||||||
|
refresh_launchd_plist_if_needed()
|
||||||
|
print("✓ Service definition updated")
|
||||||
|
return
|
||||||
print(f"Service already installed at: {plist_path}")
|
print(f"Service already installed at: {plist_path}")
|
||||||
print("Use --force to reinstall")
|
print("Use --force to reinstall")
|
||||||
return
|
return
|
||||||
@@ -816,7 +827,15 @@ def launchd_uninstall():
|
|||||||
|
|
||||||
def launchd_start():
|
def launchd_start():
|
||||||
refresh_launchd_plist_if_needed()
|
refresh_launchd_plist_if_needed()
|
||||||
subprocess.run(["launchctl", "start", "ai.hermes.gateway"], check=True)
|
plist_path = get_launchd_plist_path()
|
||||||
|
try:
|
||||||
|
subprocess.run(["launchctl", "start", "ai.hermes.gateway"], check=True)
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
if e.returncode != 3 or not plist_path.exists():
|
||||||
|
raise
|
||||||
|
print("↻ launchd job was unloaded; reloading service definition")
|
||||||
|
subprocess.run(["launchctl", "load", str(plist_path)], check=True)
|
||||||
|
subprocess.run(["launchctl", "start", "ai.hermes.gateway"], check=True)
|
||||||
print("✓ Service started")
|
print("✓ Service started")
|
||||||
|
|
||||||
def launchd_stop():
|
def launchd_stop():
|
||||||
@@ -824,22 +843,36 @@ def launchd_stop():
|
|||||||
print("✓ Service stopped")
|
print("✓ Service stopped")
|
||||||
|
|
||||||
def launchd_restart():
|
def launchd_restart():
|
||||||
refresh_launchd_plist_if_needed()
|
try:
|
||||||
launchd_stop()
|
launchd_stop()
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
if e.returncode != 3:
|
||||||
|
raise
|
||||||
|
print("↻ launchd job was unloaded; skipping stop")
|
||||||
launchd_start()
|
launchd_start()
|
||||||
|
|
||||||
def launchd_status(deep: bool = False):
|
def launchd_status(deep: bool = False):
|
||||||
|
plist_path = get_launchd_plist_path()
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["launchctl", "list", "ai.hermes.gateway"],
|
["launchctl", "list", "ai.hermes.gateway"],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True
|
text=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
print(f"Launchd plist: {plist_path}")
|
||||||
|
if launchd_plist_is_current():
|
||||||
|
print("✓ Service definition matches the current Hermes install")
|
||||||
|
else:
|
||||||
|
print("⚠ Service definition is stale relative to the current Hermes install")
|
||||||
|
print(" Run: hermes gateway start")
|
||||||
|
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
print("✓ Gateway service is loaded")
|
print("✓ Gateway service is loaded")
|
||||||
print(result.stdout)
|
print(result.stdout)
|
||||||
else:
|
else:
|
||||||
print("✗ Gateway service is not loaded")
|
print("✗ Gateway service is not loaded")
|
||||||
|
print(" Service definition exists locally but launchd has not loaded it.")
|
||||||
|
print(" Run: hermes gateway start")
|
||||||
|
|
||||||
if deep:
|
if deep:
|
||||||
log_file = get_hermes_home() / "logs" / "gateway.log"
|
log_file = get_hermes_home() / "logs" / "gateway.log"
|
||||||
@@ -1555,14 +1588,17 @@ def gateway_command(args):
|
|||||||
# Try service first, fall back to killing and restarting
|
# Try service first, fall back to killing and restarting
|
||||||
service_available = False
|
service_available = False
|
||||||
system = getattr(args, 'system', False)
|
system = getattr(args, 'system', False)
|
||||||
|
service_configured = False
|
||||||
|
|
||||||
if is_linux() and (get_systemd_unit_path(system=False).exists() or get_systemd_unit_path(system=True).exists()):
|
if is_linux() and (get_systemd_unit_path(system=False).exists() or get_systemd_unit_path(system=True).exists()):
|
||||||
|
service_configured = True
|
||||||
try:
|
try:
|
||||||
systemd_restart(system=system)
|
systemd_restart(system=system)
|
||||||
service_available = True
|
service_available = True
|
||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
pass
|
pass
|
||||||
elif is_macos() and get_launchd_plist_path().exists():
|
elif is_macos() and get_launchd_plist_path().exists():
|
||||||
|
service_configured = True
|
||||||
try:
|
try:
|
||||||
launchd_restart()
|
launchd_restart()
|
||||||
service_available = True
|
service_available = True
|
||||||
@@ -1586,6 +1622,13 @@ def gateway_command(args):
|
|||||||
print(" hermes gateway restart")
|
print(" hermes gateway restart")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if service_configured:
|
||||||
|
print()
|
||||||
|
print("✗ Gateway service restart failed.")
|
||||||
|
print(" The service definition exists, but the service manager did not recover it.")
|
||||||
|
print(" Fix the service, then retry: hermes gateway start")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
# Manual restart: kill existing processes
|
# Manual restart: kill existing processes
|
||||||
killed = kill_gateway_processes()
|
killed = kill_gateway_processes()
|
||||||
if killed:
|
if killed:
|
||||||
|
|||||||
@@ -7,6 +7,29 @@ import hermes_cli.gateway as gateway_cli
|
|||||||
|
|
||||||
|
|
||||||
class TestSystemdServiceRefresh:
|
class TestSystemdServiceRefresh:
|
||||||
|
def test_systemd_install_repairs_outdated_unit_without_force(self, tmp_path, monkeypatch):
|
||||||
|
unit_path = tmp_path / "hermes-gateway.service"
|
||||||
|
unit_path.write_text("old unit\n", encoding="utf-8")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda system=False: unit_path)
|
||||||
|
monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda system=False, run_as_user=None: "new unit\n")
|
||||||
|
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def fake_run(cmd, check=True, **kwargs):
|
||||||
|
calls.append(cmd)
|
||||||
|
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||||
|
|
||||||
|
gateway_cli.systemd_install()
|
||||||
|
|
||||||
|
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
|
||||||
|
assert calls[:2] == [
|
||||||
|
["systemctl", "--user", "daemon-reload"],
|
||||||
|
["systemctl", "--user", "enable", gateway_cli.get_service_name()],
|
||||||
|
]
|
||||||
|
|
||||||
def test_systemd_start_refreshes_outdated_unit(self, tmp_path, monkeypatch):
|
def test_systemd_start_refreshes_outdated_unit(self, tmp_path, monkeypatch):
|
||||||
unit_path = tmp_path / "hermes-gateway.service"
|
unit_path = tmp_path / "hermes-gateway.service"
|
||||||
unit_path.write_text("old unit\n", encoding="utf-8")
|
unit_path.write_text("old unit\n", encoding="utf-8")
|
||||||
@@ -96,6 +119,71 @@ class TestGatewayStopCleanup:
|
|||||||
assert kill_calls == [False]
|
assert kill_calls == [False]
|
||||||
|
|
||||||
|
|
||||||
|
class TestLaunchdServiceRecovery:
|
||||||
|
def test_launchd_install_repairs_outdated_plist_without_force(self, tmp_path, monkeypatch):
|
||||||
|
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||||
|
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||||
|
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def fake_run(cmd, check=False, **kwargs):
|
||||||
|
calls.append(cmd)
|
||||||
|
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||||
|
|
||||||
|
gateway_cli.launchd_install()
|
||||||
|
|
||||||
|
assert "--replace" in plist_path.read_text(encoding="utf-8")
|
||||||
|
assert calls[:2] == [
|
||||||
|
["launchctl", "unload", str(plist_path)],
|
||||||
|
["launchctl", "load", str(plist_path)],
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_launchd_start_reloads_unloaded_job_and_retries(self, tmp_path, monkeypatch):
|
||||||
|
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||||
|
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")
|
||||||
|
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def fake_run(cmd, check=False, **kwargs):
|
||||||
|
calls.append(cmd)
|
||||||
|
if cmd == ["launchctl", "start", "ai.hermes.gateway"] and calls.count(cmd) == 1:
|
||||||
|
raise gateway_cli.subprocess.CalledProcessError(3, cmd, stderr="Could not find service")
|
||||||
|
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||||
|
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||||
|
|
||||||
|
gateway_cli.launchd_start()
|
||||||
|
|
||||||
|
assert calls == [
|
||||||
|
["launchctl", "start", "ai.hermes.gateway"],
|
||||||
|
["launchctl", "load", str(plist_path)],
|
||||||
|
["launchctl", "start", "ai.hermes.gateway"],
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_launchd_status_reports_local_stale_plist_when_unloaded(self, tmp_path, monkeypatch, capsys):
|
||||||
|
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||||
|
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
gateway_cli.subprocess,
|
||||||
|
"run",
|
||||||
|
lambda *args, **kwargs: SimpleNamespace(returncode=113, stdout="", stderr="Could not find service"),
|
||||||
|
)
|
||||||
|
|
||||||
|
gateway_cli.launchd_status()
|
||||||
|
|
||||||
|
output = capsys.readouterr().out
|
||||||
|
assert str(plist_path) in output
|
||||||
|
assert "stale" in output.lower()
|
||||||
|
assert "not loaded" in output.lower()
|
||||||
|
|
||||||
|
|
||||||
class TestGatewayServiceDetection:
|
class TestGatewayServiceDetection:
|
||||||
def test_is_service_running_checks_system_scope_when_user_scope_is_inactive(self, monkeypatch):
|
def test_is_service_running_checks_system_scope_when_user_scope_is_inactive(self, monkeypatch):
|
||||||
user_unit = SimpleNamespace(exists=lambda: True)
|
user_unit = SimpleNamespace(exists=lambda: True)
|
||||||
@@ -158,6 +246,34 @@ class TestGatewaySystemServiceRouting:
|
|||||||
|
|
||||||
assert calls == [(False, False)]
|
assert calls == [(False, False)]
|
||||||
|
|
||||||
|
def test_gateway_restart_does_not_fallback_to_foreground_when_launchd_restart_fails(self, tmp_path, monkeypatch):
|
||||||
|
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||||
|
plist_path.write_text("plist\n", encoding="utf-8")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
|
||||||
|
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
gateway_cli,
|
||||||
|
"launchd_restart",
|
||||||
|
lambda: (_ for _ in ()).throw(
|
||||||
|
gateway_cli.subprocess.CalledProcessError(5, ["launchctl", "start", "ai.hermes.gateway"])
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
run_calls = []
|
||||||
|
monkeypatch.setattr(gateway_cli, "run_gateway", lambda verbose=False, replace=False: run_calls.append((verbose, replace)))
|
||||||
|
monkeypatch.setattr(gateway_cli, "kill_gateway_processes", lambda force=False: 0)
|
||||||
|
|
||||||
|
try:
|
||||||
|
gateway_cli.gateway_command(SimpleNamespace(gateway_command="restart", system=False))
|
||||||
|
except SystemExit as exc:
|
||||||
|
assert exc.code == 1
|
||||||
|
else:
|
||||||
|
raise AssertionError("Expected gateway_command to exit when service restart fails")
|
||||||
|
|
||||||
|
assert run_calls == []
|
||||||
|
|
||||||
|
|
||||||
class TestEnsureUserSystemdEnv:
|
class TestEnsureUserSystemdEnv:
|
||||||
"""Tests for _ensure_user_systemd_env() D-Bus session bus auto-detection."""
|
"""Tests for _ensure_user_systemd_env() D-Bus session bus auto-detection."""
|
||||||
|
|||||||
Reference in New Issue
Block a user