diff --git a/gateway/run.py b/gateway/run.py index 9fd5ac0b7..be64d13a1 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2459,34 +2459,77 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, interval: int logger.info("Cron ticker stopped") -async def start_gateway(config: Optional[GatewayConfig] = None) -> bool: +async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = False) -> bool: """ Start the gateway and run until interrupted. This is the main entry point for running the gateway. Returns True if the gateway ran successfully, False if it failed to start. A False return causes a non-zero exit code so systemd can auto-restart. + + Args: + config: Optional gateway configuration override. + replace: If True, kill any existing gateway instance before starting. + Useful for systemd services to avoid restart-loop deadlocks + when the previous process hasn't fully exited yet. """ # ── Duplicate-instance guard ────────────────────────────────────── # Prevent two gateways from running under the same HERMES_HOME. # The PID file is scoped to HERMES_HOME, so future multi-profile # setups (each profile using a distinct HERMES_HOME) will naturally # allow concurrent instances without tripping this guard. - from gateway.status import get_running_pid + import time as _time + from gateway.status import get_running_pid, remove_pid_file existing_pid = get_running_pid() if existing_pid is not None and existing_pid != os.getpid(): - hermes_home = os.getenv("HERMES_HOME", "~/.hermes") - logger.error( - "Another gateway instance is already running (PID %d, HERMES_HOME=%s). " - "Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.", - existing_pid, hermes_home, - ) - print( - f"\n❌ Gateway already running (PID {existing_pid}).\n" - f" Use 'hermes gateway restart' to replace it,\n" - f" or 'hermes gateway stop' to kill it first.\n" - ) - return False + if replace: + logger.info( + "Replacing existing gateway instance (PID %d) with --replace.", + existing_pid, + ) + try: + os.kill(existing_pid, signal.SIGTERM) + except ProcessLookupError: + pass # Already gone + except PermissionError: + logger.error( + "Permission denied killing PID %d. Cannot replace.", + existing_pid, + ) + return False + # Wait up to 10 seconds for the old process to exit + for _ in range(20): + try: + os.kill(existing_pid, 0) + _time.sleep(0.5) + except (ProcessLookupError, PermissionError): + break # Process is gone + else: + # Still alive after 10s — force kill + logger.warning( + "Old gateway (PID %d) did not exit after SIGTERM, sending SIGKILL.", + existing_pid, + ) + try: + os.kill(existing_pid, signal.SIGKILL) + _time.sleep(0.5) + except (ProcessLookupError, PermissionError): + pass + remove_pid_file() + else: + hermes_home = os.getenv("HERMES_HOME", "~/.hermes") + logger.error( + "Another gateway instance is already running (PID %d, HERMES_HOME=%s). " + "Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.", + existing_pid, hermes_home, + ) + print( + f"\n❌ Gateway already running (PID {existing_pid}).\n" + f" Use 'hermes gateway restart' to replace it,\n" + f" or 'hermes gateway stop' to kill it first.\n" + f" Or use 'hermes gateway run --replace' to auto-replace.\n" + ) + return False # Sync bundled skills on gateway start (fast -- skips unchanged) try: diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 3cc4941ab..b2f5f57d0 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -154,19 +154,25 @@ def get_hermes_cli_path() -> str: # ============================================================================= def generate_systemd_unit() -> str: + import shutil python_path = get_python_path() working_dir = str(PROJECT_ROOT) + hermes_cli = shutil.which("hermes") or f"{python_path} -m hermes_cli.main" return f"""[Unit] Description={SERVICE_DESCRIPTION} After=network.target [Service] Type=simple -ExecStart={python_path} -m hermes_cli.main gateway run +ExecStart={python_path} -m hermes_cli.main gateway run --replace +ExecStop={hermes_cli} gateway stop WorkingDirectory={working_dir} Restart=on-failure RestartSec=10 +KillMode=mixed +KillSignal=SIGTERM +TimeoutStopSec=15 StandardOutput=journal StandardError=journal @@ -377,8 +383,15 @@ def launchd_status(deep: bool = False): # Gateway Runner # ============================================================================= -def run_gateway(verbose: bool = False): - """Run the gateway in foreground.""" +def run_gateway(verbose: bool = False, replace: bool = False): + """Run the gateway in foreground. + + Args: + verbose: Enable verbose logging output. + replace: If True, kill any existing gateway instance before starting. + This prevents systemd restart loops when the old process + hasn't fully exited yet. + """ sys.path.insert(0, str(PROJECT_ROOT)) from gateway.run import start_gateway @@ -393,7 +406,7 @@ def run_gateway(verbose: bool = False): # Exit with code 1 if gateway fails to connect any platform, # so systemd Restart=on-failure will retry on transient errors - success = asyncio.run(start_gateway()) + success = asyncio.run(start_gateway(replace=replace)) if not success: sys.exit(1) @@ -765,7 +778,8 @@ def gateway_command(args): # Default to run if no subcommand if subcmd is None or subcmd == "run": verbose = getattr(args, 'verbose', False) - run_gateway(verbose) + replace = getattr(args, 'replace', False) + run_gateway(verbose, replace=replace) return if subcmd == "setup": diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 78c50468d..55c41e37b 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -1315,6 +1315,8 @@ For more help on a command: # gateway run (default) gateway_run = gateway_subparsers.add_parser("run", help="Run gateway in foreground") gateway_run.add_argument("-v", "--verbose", action="store_true") + gateway_run.add_argument("--replace", action="store_true", + help="Replace any existing gateway instance (useful for systemd)") # gateway start gateway_start = gateway_subparsers.add_parser("start", help="Start gateway service")