From 4bded44b6aaf9ad0f35ebc2720b44a56e888864c Mon Sep 17 00:00:00 2001
From: Teknium
Date: Sat, 21 Mar 2026 18:13:53 -0700
Subject: [PATCH] fix(gateway): detect stopped processes and release stale locks on --replace

---
 gateway/run.py    | 10 ++++++++++
 gateway/status.py | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/gateway/run.py b/gateway/run.py
index 814757529..5bb8d6825 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -5173,6 +5173,16 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
                 except (ProcessLookupError, PermissionError):
                     pass
             remove_pid_file()
+            # Also release all scoped locks left by the old process.
+            # Stopped (Ctrl+Z) processes don't release locks on exit,
+            # leaving stale lock files that block the new gateway from starting.
+            try:
+                from gateway.status import release_all_scoped_locks
+                _released = release_all_scoped_locks()
+                if _released:
+                    logger.info("Released %d stale scoped lock(s) from old gateway.", _released)
+            except Exception:  # best-effort cleanup: report it, but never abort startup
+                logger.warning("Failed to release stale scoped locks.", exc_info=True)
         else:
             hermes_home = os.getenv("HERMES_HOME", "~/.hermes")
             logger.error(
diff --git a/gateway/status.py b/gateway/status.py
index 72a19a56e..f5f5649b5 100644
--- a/gateway/status.py
+++ b/gateway/status.py
@@ -274,6 +274,21 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
                     and current_start != existing.get("start_time")
                 ):
                     stale = True
+        # Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
+        # processes still respond to os.kill(pid, 0) but are not
+        # actually running. Treat them as stale so --replace works.
+        if not stale:
+            try:
+                _proc_status = Path(f"/proc/{existing_pid}/status")
+                if _proc_status.exists():
+                    for _line in _proc_status.read_text().splitlines():
+                        if _line.startswith("State:"):
+                            _state = _line.split()[1]
+                            if _state in ("T", "t"):  # stopped or tracing stop
+                                stale = True
+                            break
+            except (OSError, ValueError, IndexError):  # unreadable/undecodable/malformed
+                pass
         if stale:
             try:
                 lock_path.unlink(missing_ok=True)
@@ -314,6 +329,25 @@ def release_scoped_lock(scope: str, identity: str) -> None:
         pass
 
 
+def release_all_scoped_locks() -> int:
+    """Remove every ``*.lock`` file in the directory from ``_get_lock_dir()``.
+
+    Called during --replace to clean up stale locks left by stopped/killed
+    gateway processes. Ownership is NOT checked: every lock file is deleted.
+    Returns the number of lock files this call actually deleted.
+    """
+    lock_dir = _get_lock_dir()
+    removed = 0
+    if lock_dir.exists():
+        for lock_file in lock_dir.glob("*.lock"):
+            try:
+                lock_file.unlink()  # no missing_ok: count only real deletions
+                removed += 1
+            except OSError:
+                pass
+    return removed
+
+
 def get_running_pid() -> Optional[int]:
     """Return the PID of a running gateway instance, or ``None``.
 