fix: write update exit code before gateway restart (cgroup kill race) (#8288)

When /update runs via Telegram, hermes update --gateway is spawned inside
the gateway's systemd cgroup.  The update process itself calls
systemctl restart hermes-gateway, which tears down the cgroup with
KillMode=mixed — SIGKILL to all remaining processes.  The wrapping bash
shell is killed before it can execute the exit-code epilogue, so
.update_exit_code is never created.  The new gateway's update watcher
then polls for 30 minutes and sends a spurious timeout message.

Fix: write .update_exit_code from Python inside cmd_update() immediately
after the git pull + pip install succeed ("Update complete!"), before
attempting the gateway restart.  The shell epilogue still writes it too
(idempotent overwrite), but now the marker exists even when the process
is killed mid-restart.
This commit is contained in:
Teknium
2026-04-12 02:33:21 -07:00
committed by GitHub
parent b321330362
commit 56e3ee2440
2 changed files with 137 additions and 0 deletions

View File

@@ -3939,6 +3939,26 @@ def cmd_update(args):
print()
print("✓ Update complete!")
# Write exit code *before* the gateway restart attempt.
# When running as ``hermes update --gateway`` (spawned by the gateway's
# /update command), this process lives inside the gateway's systemd
# cgroup. ``systemctl restart hermes-gateway`` kills everything in the
# cgroup (KillMode=mixed → SIGKILL to remaining processes), including
# us and the wrapping bash shell. The shell never reaches its
# ``printf $status > .update_exit_code`` epilogue, so the exit-code
# marker file is never created. The new gateway's update watcher then
# polls for 30 minutes and sends a spurious timeout message.
#
# Writing the marker here — after git pull + pip install succeed but
# before we attempt the restart — ensures the new gateway sees it
# regardless of how we die.
if gateway_mode:
_exit_code_path = get_hermes_home() / ".update_exit_code"
try:
_exit_code_path.write_text("0")
except OSError:
pass
# Auto-restart ALL gateways after update.
# The code update (git pull) is shared across all profiles, so every
# running gateway needs restarting to pick up the new code.

View File

@@ -798,3 +798,120 @@ class TestFindGatewayPidsExclude:
pids = gateway_cli.find_gateway_pids()
assert pids == [100]
# ---------------------------------------------------------------------------
# Gateway mode writes exit code before restart (#8300)
# ---------------------------------------------------------------------------
class TestGatewayModeWritesExitCodeEarly:
"""When running as ``hermes update --gateway``, the exit code marker must be
written *before* the gateway restart attempt. Without this, systemd's
``KillMode=mixed`` kills the update process (and its wrapping shell) during
the cgroup teardown, so the shell epilogue that normally writes the exit
code never executes. The new gateway's update watcher then polls for 30
minutes and sends a spurious timeout message.
"""
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_exit_code_written_in_gateway_mode(
self, mock_run, _mock_which, capsys, tmp_path, monkeypatch,
):
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: False)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
# Point HERMES_HOME at a temp dir so the marker file lands there
hermes_home = tmp_path / ".hermes"
hermes_home.mkdir()
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
import hermes_cli.config as _cfg
monkeypatch.setattr(_cfg, "get_hermes_home", lambda: hermes_home)
# Also patch the module-level ref used by cmd_update
import hermes_cli.main as _main_mod
monkeypatch.setattr(_main_mod, "get_hermes_home", lambda: hermes_home)
mock_run.side_effect = _make_run_side_effect(commit_count="1")
args = SimpleNamespace(gateway=True)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(args)
exit_code_path = hermes_home / ".update_exit_code"
assert exit_code_path.exists(), ".update_exit_code not written in gateway mode"
assert exit_code_path.read_text() == "0"
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_exit_code_not_written_in_normal_mode(
self, mock_run, _mock_which, capsys, tmp_path, monkeypatch,
):
"""Non-gateway mode should NOT write the exit code (the shell does it)."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: False)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
hermes_home = tmp_path / ".hermes"
hermes_home.mkdir()
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
import hermes_cli.config as _cfg
monkeypatch.setattr(_cfg, "get_hermes_home", lambda: hermes_home)
import hermes_cli.main as _main_mod
monkeypatch.setattr(_main_mod, "get_hermes_home", lambda: hermes_home)
mock_run.side_effect = _make_run_side_effect(commit_count="1")
args = SimpleNamespace(gateway=False)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(args)
exit_code_path = hermes_home / ".update_exit_code"
assert not exit_code_path.exists(), ".update_exit_code should not be written outside gateway mode"
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_exit_code_written_before_restart_call(
self, mock_run, _mock_which, capsys, tmp_path, monkeypatch,
):
"""Exit code must exist BEFORE systemctl restart is called."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
hermes_home = tmp_path / ".hermes"
hermes_home.mkdir()
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
import hermes_cli.config as _cfg
monkeypatch.setattr(_cfg, "get_hermes_home", lambda: hermes_home)
import hermes_cli.main as _main_mod
monkeypatch.setattr(_main_mod, "get_hermes_home", lambda: hermes_home)
exit_code_path = hermes_home / ".update_exit_code"
# Track whether exit code exists when systemctl restart is called
exit_code_existed_at_restart = []
original_side_effect = _make_run_side_effect(
commit_count="1", systemd_active=True,
)
def tracking_side_effect(cmd, **kwargs):
joined = " ".join(str(c) for c in cmd)
if "systemctl" in joined and "restart" in joined:
exit_code_existed_at_restart.append(exit_code_path.exists())
return original_side_effect(cmd, **kwargs)
mock_run.side_effect = tracking_side_effect
args = SimpleNamespace(gateway=True)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(args)
assert exit_code_existed_at_restart, "systemctl restart was never called"
assert exit_code_existed_at_restart[0] is True, \
".update_exit_code must exist BEFORE systemctl restart (cgroup kill race)"