fix(gateway): detect stopped processes and release stale locks on --replace

This commit is contained in:
Teknium
2026-03-21 18:13:53 -07:00
parent 29d0541ac9
commit 4bded44b6a
2 changed files with 44 additions and 0 deletions

View File

@@ -5173,6 +5173,16 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
except (ProcessLookupError, PermissionError):
pass
remove_pid_file()
# Also release all scoped locks left by the old process.
# Stopped (Ctrl+Z) processes don't release locks on exit,
# leaving stale lock files that block the new gateway from starting.
try:
from gateway.status import release_all_scoped_locks
_released = release_all_scoped_locks()
if _released:
logger.info("Released %d stale scoped lock(s) from old gateway.", _released)
except Exception:
pass
else:
hermes_home = os.getenv("HERMES_HOME", "~/.hermes")
logger.error(

View File

@@ -274,6 +274,21 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
and current_start != existing.get("start_time")
):
stale = True
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
# processes still respond to os.kill(pid, 0) but are not
# actually running. Treat them as stale so --replace works.
if not stale:
try:
_proc_status = Path(f"/proc/{existing_pid}/status")
if _proc_status.exists():
for _line in _proc_status.read_text().splitlines():
if _line.startswith("State:"):
_state = _line.split()[1]
if _state in ("T", "t"): # stopped or tracing stop
stale = True
break
except (OSError, PermissionError):
pass
if stale:
try:
lock_path.unlink(missing_ok=True)
@@ -314,6 +329,25 @@ def release_scoped_lock(scope: str, identity: str) -> None:
pass
def release_all_scoped_locks() -> int:
"""Remove all scoped lock files in the lock directory.
Called during --replace to clean up stale locks left by stopped/killed
gateway processes that did not release their locks gracefully.
Returns the number of lock files removed.
"""
lock_dir = _get_lock_dir()
removed = 0
if lock_dir.exists():
for lock_file in lock_dir.glob("*.lock"):
try:
lock_file.unlink(missing_ok=True)
removed += 1
except OSError:
pass
return removed
def get_running_pid() -> Optional[int]:
"""Return the PID of a running gateway instance, or ``None``.