From 014a5b712d4cdc352b68e2510e11d0a372fd4a3d Mon Sep 17 00:00:00 2001 From: teknium1 Date: Thu, 5 Mar 2026 20:35:33 -0800 Subject: [PATCH] fix: prevent duplicate gateway instances from running simultaneously start_gateway() now checks for an existing running instance via PID file before starting. If another gateway is already running under the same HERMES_HOME, it refuses to start with a clear error message directing the user to 'hermes gateway restart' or 'hermes gateway stop'. Also fixes gateway/status.py to respect the HERMES_HOME env var instead of hardcoding ~/.hermes. This scopes the PID file per HERMES_HOME directory, which lays the groundwork for future multi-profile support where distinct HERMES_HOME directories can run concurrent gateway instances independently. --- gateway/run.py | 21 ++++++++++++++++++++ gateway/status.py | 50 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 2ed9ed8c2..3af04f1ea 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2389,6 +2389,27 @@ async def start_gateway(config: Optional[GatewayConfig] = None) -> bool: Returns True if the gateway ran successfully, False if it failed to start. A False return causes a non-zero exit code so systemd can auto-restart. """ + # ── Duplicate-instance guard ────────────────────────────────────── + # Prevent two gateways from running under the same HERMES_HOME. + # The PID file is scoped to HERMES_HOME, so future multi-profile + # setups (each profile using a distinct HERMES_HOME) will naturally + # allow concurrent instances without tripping this guard. + from gateway.status import get_running_pid + existing_pid = get_running_pid() + if existing_pid is not None and existing_pid != os.getpid(): + hermes_home = os.getenv("HERMES_HOME", "~/.hermes") + logger.error( + "Another gateway instance is already running (PID %d, HERMES_HOME=%s). 
" + "Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.", + existing_pid, hermes_home, + ) + print( + f"\n❌ Gateway already running (PID {existing_pid}).\n" + f" Use 'hermes gateway restart' to replace it,\n" + f" or 'hermes gateway stop' to kill it first.\n" + ) + return False + # Configure rotating file log so gateway output is persisted for debugging log_dir = _hermes_home / 'logs' log_dir.mkdir(parents=True, exist_ok=True) diff --git a/gateway/status.py b/gateway/status.py index f28adc880..78d71947f 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -3,37 +3,59 @@ Gateway runtime status helpers. Provides PID-file based detection of whether the gateway daemon is running, used by send_message's check_fn to gate availability in the CLI. + +The PID file lives at ``{HERMES_HOME}/gateway.pid``. HERMES_HOME defaults to +``~/.hermes`` but can be overridden via the environment variable. This means +separate HERMES_HOME directories naturally get separate PID files — a property +that will be useful when we add named profiles (multiple agents running +concurrently under distinct configurations). 
# Module body of gateway/status.py (everything after the module docstring).
import os
from pathlib import Path
from typing import Optional


def _get_pid_path() -> Path:
    """Return the path to the gateway PID file, respecting HERMES_HOME.

    The file is ``gateway.pid`` inside the directory named by the
    ``HERMES_HOME`` environment variable, or inside ``~/.hermes`` when the
    variable is not set.
    """
    configured = os.environ.get("HERMES_HOME")
    if configured is not None:
        return Path(configured) / "gateway.pid"
    # Resolve the user's home directory only when the fallback is needed.
    return Path.home() / ".hermes" / "gateway.pid"


def write_pid_file() -> None:
    """Write the current process PID to the gateway PID file.

    The parent directory is created first if it does not exist yet.
    """
    pid_path = _get_pid_path()
    pid_path.parent.mkdir(parents=True, exist_ok=True)
    pid_path.write_text(str(os.getpid()))


def remove_pid_file() -> None:
    """Remove the gateway PID file if it exists.

    Best-effort cleanup: a missing file is not an error, and any other
    filesystem failure (e.g. insufficient permissions) is ignored so callers
    can use this from shutdown and error paths.
    """
    try:
        _get_pid_path().unlink(missing_ok=True)
    except OSError:
        pass


def get_running_pid() -> Optional[int]:
    """Return the PID of a running gateway instance, or ``None``.

    Reads the PID file and verifies that the recorded process is alive.
    Stale PID files (unparseable content, non-positive PID, or a PID that no
    longer exists) are removed automatically.

    Returns:
        The recorded PID when that process exists, including when it exists
        but belongs to another user; ``None`` when there is no PID file, it
        cannot be read, or it is stale.
    """
    pid_path = _get_pid_path()
    try:
        # Read directly instead of checking exists() first: the file may
        # disappear between the check and the read.
        pid = int(pid_path.read_text().strip())
    except OSError:
        # Missing or unreadable file: nothing we can verify, nothing to clean.
        return None
    except ValueError:
        # Empty, non-numeric or undecodable content.
        remove_pid_file()
        return None

    if pid <= 0:
        # kill(0, ...) and kill(<negative>, ...) address process groups, not
        # a single process, so such values can never identify a gateway.
        remove_pid_file()
        return None

    # NOTE(review): on Windows, os.kill() with a signal other than
    # CTRL_C_EVENT / CTRL_BREAK_EVENT terminates the target process, so this
    # probe is only safe on POSIX platforms -- confirm supported platforms.
    try:
        os.kill(pid, 0)  # signal 0 = existence check, no actual signal sent
    except PermissionError:
        # EPERM: the process exists but is owned by another user. It is
        # alive, so report it and leave its PID file alone.
        return pid
    except (ProcessLookupError, OverflowError):
        # No such process, or a value too large to be a valid PID.
        remove_pid_file()
        return None
    return pid


def is_gateway_running() -> bool:
    """Check if the gateway daemon is currently running."""
    return get_running_pid() is not None