Merge origin/main into hermes/hermes-5d160594

2026-03-14 19:34:05 -07:00
parent 2536ff328b 81cd367aec
commit 3229e434b8
78 changed files with 3762 additions and 395 deletions
--- a/gateway/delivery.py
+++ b/gateway/delivery.py
@@ -161,7 +161,7 @@ class DeliveryRouter:
        
        # Always include local if configured
        if self.config.always_log_local:
-            local_key = (Platform.LOCAL, None)
+            local_key = (Platform.LOCAL, None, None)
            if local_key not in seen_platforms:
                targets.append(DeliveryTarget(platform=Platform.LOCAL))
        
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -346,6 +346,10 @@ class BasePlatformAdapter(ABC):
        self.platform = platform
        self._message_handler: Optional[MessageHandler] = None
        self._running = False
+        self._fatal_error_code: Optional[str] = None
+        self._fatal_error_message: Optional[str] = None
+        self._fatal_error_retryable = True
+        self._fatal_error_handler: Optional[Callable[["BasePlatformAdapter"], Awaitable[None] | None]] = None
        
        # Track active message handlers per session for interrupt support
        # Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt)
@@ -353,6 +357,70 @@ class BasePlatformAdapter(ABC):
        self._pending_messages: Dict[str, MessageEvent] = {}
        # Chats where auto-TTS on voice input is disabled (set by /voice off)
        self._auto_tts_disabled_chats: set = set()
+
+    @property
+    def has_fatal_error(self) -> bool:
+        """Return True while a fatal error message is recorded on this adapter."""
+        return self._fatal_error_message is not None
+
+    @property
+    def fatal_error_message(self) -> Optional[str]:
+        """Return the recorded fatal error message, or None when none is set."""
+        return self._fatal_error_message
+
+    @property
+    def fatal_error_code(self) -> Optional[str]:
+        """Return the recorded fatal error code, or None when none is set."""
+        return self._fatal_error_code
+
+    @property
+    def fatal_error_retryable(self) -> bool:
+        """Return the stored retryable flag for the current fatal error state."""
+        return self._fatal_error_retryable
+
+    def set_fatal_error_handler(self, handler: Callable[["BasePlatformAdapter"], Awaitable[None] | None]) -> None:
+        """Store the callback that ``_notify_fatal_error`` invokes with this adapter.
+
+        The callback may be synchronous; a coroutine it returns is awaited.
+        """
+        self._fatal_error_handler = handler
+
+    def _mark_connected(self) -> None:
+        """Flag the adapter as running, reset fatal-error state, and report it.
+
+        The runtime-status write is best effort: any failure is ignored.
+        """
+        self._fatal_error_code = self._fatal_error_message = None
+        self._fatal_error_retryable = True
+        self._running = True
+        try:
+            from gateway.status import write_runtime_status
+
+            write_runtime_status(
+                platform=self.platform.value,
+                platform_state="connected",
+                error_code=None,
+                error_message=None,
+            )
+        except Exception:
+            pass
+
+    def _mark_disconnected(self) -> None:
+        """Flag the adapter as stopped and report "disconnected" unless a fatal error is set.
+
+        When a fatal error is recorded no status is written. The status write
+        itself is best effort: any failure is ignored.
+        """
+        self._running = False
+        if not self.has_fatal_error:
+            try:
+                from gateway.status import write_runtime_status
+
+                write_runtime_status(
+                    platform=self.platform.value,
+                    platform_state="disconnected",
+                    error_code=None,
+                    error_message=None,
+                )
+            except Exception:
+                pass
+
+    def _set_fatal_error(self, code: str, message: str, *, retryable: bool) -> None:
+        """Record a fatal error on the adapter and report the "fatal" platform state.
+
+        Args:
+            code: Short identifier stored as the fatal error code.
+            message: Description stored as the fatal error message.
+            retryable: Stored as the retryable flag for this error.
+
+        The adapter is marked as not running. The runtime-status write is best
+        effort: any failure is ignored.
+        """
+        self._fatal_error_code, self._fatal_error_message = code, message
+        self._fatal_error_retryable = retryable
+        self._running = False
+        try:
+            from gateway.status import write_runtime_status
+
+            status_fields = {
+                "platform": self.platform.value,
+                "platform_state": "fatal",
+                "error_code": code,
+                "error_message": message,
+            }
+            write_runtime_status(**status_fields)
+        except Exception:
+            pass
+
+    async def _notify_fatal_error(self) -> None:
+        """Invoke the registered fatal-error handler, if any, with this adapter.
+
+        The handler may be synchronous or asynchronous. Any awaitable it returns
+        (coroutine, Task, Future) is awaited, matching the
+        ``Awaitable[None] | None`` handler type accepted by
+        ``set_fatal_error_handler``.
+        """
+        import inspect
+
+        handler = self._fatal_error_handler
+        if not handler:
+            return
+        result = handler(self)
+        # asyncio.iscoroutine() would skip Futures/Tasks; accept every awaitable.
+        if inspect.isawaitable(result):
+            await result
    
    @property
    def name(self) -> str:
--- a/gateway/platforms/telegram.py
+++ b/gateway/platforms/telegram.py
@@ -105,12 +105,43 @@ class TelegramAdapter(BasePlatformAdapter):
    
    # Telegram message limits
    MAX_MESSAGE_LENGTH = 4096
+    MEDIA_GROUP_WAIT_SECONDS = 0.8
    
    def __init__(self, config: PlatformConfig):
        super().__init__(config, Platform.TELEGRAM)
        self._app: Optional[Application] = None
        self._bot: Optional[Bot] = None
-    
+        self._media_group_events: Dict[str, MessageEvent] = {}
+        self._media_group_tasks: Dict[str, asyncio.Task] = {}
+        self._token_lock_identity: Optional[str] = None
+        self._polling_error_task: Optional[asyncio.Task] = None
+
+    @staticmethod
+    def _looks_like_polling_conflict(error: Exception) -> bool:
+        """Return True when *error* looks like a Telegram getUpdates polling conflict.
+
+        Matches either an exception class named "Conflict" (case-insensitive) or
+        one of the known conflict phrases in the lower-cased error text.
+        """
+        detail = str(error).lower()
+        if type(error).__name__.lower() == "conflict":
+            return True
+        conflict_phrases = (
+            "terminated by other getupdates request",
+            "another bot instance is running",
+        )
+        return any(phrase in detail for phrase in conflict_phrases)
+
+    async def _handle_polling_conflict(self, error: Exception) -> None:
+        """Record a fatal polling conflict, stop the updater, and notify the handler.
+
+        Does nothing when this conflict is already recorded on the adapter. A
+        failure while stopping the updater is logged and does not prevent the
+        fatal-error notification.
+        """
+        conflict_code = "telegram_polling_conflict"
+        if self.has_fatal_error and self.fatal_error_code == conflict_code:
+            return
+
+        message = (
+            "Another Telegram bot poller is already using this token. "
+            "Hermes stopped Telegram polling to avoid endless retry spam. "
+            "Make sure only one gateway instance is running for this bot token."
+        )
+        logger.error("[%s] %s Original error: %s", self.name, message, error)
+        self._set_fatal_error(conflict_code, message, retryable=False)
+
+        try:
+            app = self._app
+            updater = app.updater if app else None
+            if updater:
+                await updater.stop()
+        except Exception as stop_error:
+            logger.warning(
+                "[%s] Failed stopping Telegram polling after conflict: %s",
+                self.name,
+                stop_error,
+                exc_info=True,
+            )
+        await self._notify_fatal_error()
+
    async def connect(self) -> bool:
        """Connect to Telegram and start polling for updates."""
        if not TELEGRAM_AVAILABLE:
@@ -125,6 +156,25 @@ class TelegramAdapter(BasePlatformAdapter):
            return False
        
        try:
+            from gateway.status import acquire_scoped_lock
+
+            self._token_lock_identity = self.config.token
+            acquired, existing = acquire_scoped_lock(
+                "telegram-bot-token",
+                self._token_lock_identity,
+                metadata={"platform": self.platform.value},
+            )
+            if not acquired:
+                owner_pid = existing.get("pid") if isinstance(existing, dict) else None
+                message = (
+                    "Another local Hermes gateway is already using this Telegram bot token"
+                    + (f" (PID {owner_pid})." if owner_pid else ".")
+                    + " Stop the other gateway before starting a second Telegram poller."
+                )
+                logger.error("[%s] %s", self.name, message)
+                self._set_fatal_error("telegram_token_lock", message, retryable=False)
+                return False
+
            # Build the application
            self._app = Application.builder().token(self.config.token).build()
            self._bot = self._app.bot
@@ -150,9 +200,20 @@ class TelegramAdapter(BasePlatformAdapter):
            # Start polling in background
            await self._app.initialize()
            await self._app.start()
+            loop = asyncio.get_running_loop()
+
+            def _polling_error_callback(error: Exception) -> None:
+                if not self._looks_like_polling_conflict(error):
+                    logger.error("[%s] Telegram polling error: %s", self.name, error, exc_info=True)
+                    return
+                if self._polling_error_task and not self._polling_error_task.done():
+                    return
+                self._polling_error_task = loop.create_task(self._handle_polling_conflict(error))
+
            await self._app.updater.start_polling(
                allowed_updates=Update.ALL_TYPES,
                drop_pending_updates=True,
+                error_callback=_polling_error_callback,
            )
            
            # Register bot commands so Telegram shows a hint menu when users type /
@@ -188,16 +249,30 @@ class TelegramAdapter(BasePlatformAdapter):
                    exc_info=True,
                )
            
-            self._running = True
+            self._mark_connected()
            logger.info("[%s] Connected and polling for Telegram updates", self.name)
            return True
            
        except Exception as e:
+            if self._token_lock_identity:
+                try:
+                    from gateway.status import release_scoped_lock
+                    release_scoped_lock("telegram-bot-token", self._token_lock_identity)
+                except Exception:
+                    pass
            logger.error("[%s] Failed to connect to Telegram: %s", self.name, e, exc_info=True)
            return False
    
    async def disconnect(self) -> None:
-        """Stop polling and disconnect."""
+        """Stop polling, cancel pending album flushes, and disconnect."""
+        pending_media_group_tasks = list(self._media_group_tasks.values())
+        for task in pending_media_group_tasks:
+            task.cancel()
+        if pending_media_group_tasks:
+            await asyncio.gather(*pending_media_group_tasks, return_exceptions=True)
+        self._media_group_tasks.clear()
+        self._media_group_events.clear()
+
        if self._app:
            try:
                await self._app.updater.stop()
@@ -205,10 +280,17 @@ class TelegramAdapter(BasePlatformAdapter):
                await self._app.shutdown()
            except Exception as e:
                logger.warning("[%s] Error during Telegram disconnect: %s", self.name, e, exc_info=True)
+        if self._token_lock_identity:
+            try:
+                from gateway.status import release_scoped_lock
+                release_scoped_lock("telegram-bot-token", self._token_lock_identity)
+            except Exception as e:
+                logger.warning("[%s] Error releasing Telegram token lock: %s", self.name, e, exc_info=True)
        
-        self._running = False
+        self._mark_disconnected()
        self._app = None
        self._bot = None
+        self._token_lock_identity = None
        logger.info("[%s] Disconnected from Telegram", self.name)
    
    async def send(
@@ -872,8 +954,53 @@ class TelegramAdapter(BasePlatformAdapter):
            except Exception as e:
                logger.warning("[Telegram] Failed to cache document: %s", e, exc_info=True)

+        media_group_id = getattr(msg, "media_group_id", None)
+        if media_group_id:
+            await self._queue_media_group_event(str(media_group_id), event)
+            return
+
        await self.handle_message(event)
    
+    async def _queue_media_group_event(self, media_group_id: str, event: MessageEvent) -> None:
+        """Collect the items of one Telegram album into a single buffered event.
+
+        Album items arrive as separate updates sharing a media_group_id. The
+        first item becomes the buffered event; later items add their media URLs
+        and types to it, and their caption is appended (separated by a blank
+        line) unless the buffered text already contains it. Every call cancels
+        the pending flush task for the group and schedules a new one, so the
+        album is forwarded once, after items stop arriving.
+        """
+        buffered = self._media_group_events.get(media_group_id)
+        if buffered is None:
+            self._media_group_events[media_group_id] = event
+        else:
+            buffered.media_urls.extend(event.media_urls)
+            buffered.media_types.extend(event.media_types)
+            caption = event.text
+            if caption and not buffered.text:
+                buffered.text = caption
+            elif caption and caption not in buffered.text.split("\n\n"):
+                buffered.text = f"{buffered.text}\n\n{caption}"
+
+        pending_flush = self._media_group_tasks.get(media_group_id)
+        if pending_flush:
+            pending_flush.cancel()
+
+        flush = self._flush_media_group_event(media_group_id)
+        self._media_group_tasks[media_group_id] = asyncio.create_task(flush)
+
+    async def _flush_media_group_event(self, media_group_id: str) -> None:
+        """Forward a buffered album after the debounce delay.
+
+        Sleeps for ``MEDIA_GROUP_WAIT_SECONDS``, then pops the buffered event for
+        *media_group_id* and passes it to ``handle_message``. Cancellation (a newer
+        album item rescheduled the flush, or the adapter is disconnecting) ends
+        the task quietly.
+        """
+        try:
+            await asyncio.sleep(self.MEDIA_GROUP_WAIT_SECONDS)
+            event = self._media_group_events.pop(media_group_id, None)
+            if event is not None:
+                await self.handle_message(event)
+        except asyncio.CancelledError:
+            return
+        finally:
+            # A cancelled flush only unwinds after its replacement has been stored
+            # under the same key, so drop the registry entry only if it is still
+            # ours. Otherwise the replacement would become untracked and could no
+            # longer be cancelled by later album items or by disconnect().
+            if self._media_group_tasks.get(media_group_id) is asyncio.current_task():
+                self._media_group_tasks.pop(media_group_id, None)
+
    async def _handle_sticker(self, msg: Message, event: "MessageEvent") -> None:
        """
        Describe a Telegram sticker via vision analysis, with caching.
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -215,6 +215,33 @@ def _resolve_gateway_model() -> str:
    return model


+def _resolve_hermes_bin() -> Optional[list[str]]:
+    """Build the argv prefix used to launch the Hermes CLI.
+
+    Lookup order:
+    1. The ``hermes`` executable found on PATH via ``shutil.which``.
+    2. ``sys.executable -m hermes_cli.main`` when the ``hermes_cli`` package is
+       importable from the current interpreter (e.g. venv/module invocation
+       without the ``hermes`` shim on PATH).
+
+    Returns the argv parts ready for quoting/joining, or ``None`` when neither
+    lookup succeeds.
+    """
+    import shutil
+
+    on_path = shutil.which("hermes")
+    if on_path:
+        return [on_path]
+
+    try:
+        import importlib.util
+
+        spec = importlib.util.find_spec("hermes_cli")
+    except Exception:
+        spec = None
+
+    if spec is None:
+        return None
+    return [sys.executable, "-m", "hermes_cli.main"]
+
+
 class GatewayRunner:
    """
    Main gateway controller.
@@ -245,6 +272,8 @@ class GatewayRunner:
        self.delivery_router = DeliveryRouter(self.config)
        self._running = False
        self._shutdown_event = asyncio.Event()
+        self._exit_cleanly = False
+        self._exit_reason: Optional[str] = None
        
        # Track running agents per session for interrupt support
        # Key: session_key, Value: AIAgent instance
@@ -463,6 +492,41 @@ class GatewayRunner:
        """Run the sync memory flush in a thread pool so it won't block the event loop."""
        loop = asyncio.get_event_loop()
        await loop.run_in_executor(None, self._flush_memories_for_session, old_session_id)
+
+    @property
+    def should_exit_cleanly(self) -> bool:
+        """Return the clean-exit flag (set by ``_request_clean_exit``)."""
+        return self._exit_cleanly
+
+    @property
+    def exit_reason(self) -> Optional[str]:
+        """Return the recorded exit reason, or None when none has been set."""
+        return self._exit_reason
+
+    async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None:
+        """React to a non-retryable adapter failure after startup.
+
+        Logs the failure, then disconnects and unregisters *adapter* when it is
+        the instance currently registered for its platform. Once no adapters
+        remain, records the exit reason and stops the gateway.
+        """
+        logger.error(
+            "Fatal %s adapter error (%s): %s",
+            adapter.platform.value,
+            adapter.fatal_error_code or "unknown",
+            adapter.fatal_error_message or "unknown error",
+        )
+
+        existing = self.adapters.get(adapter.platform)
+        if existing is adapter:
+            try:
+                await adapter.disconnect()
+            except Exception as disconnect_error:
+                # A failed disconnect must not abort this handler: the adapter
+                # still has to be unregistered and the "no adapters remain"
+                # shutdown check below still has to run.
+                logger.warning(
+                    "Error disconnecting %s adapter after fatal error: %s",
+                    adapter.platform.value,
+                    disconnect_error,
+                    exc_info=True,
+                )
+            finally:
+                self.adapters.pop(adapter.platform, None)
+                self.delivery_router.adapters = self.adapters
+
+        if not self.adapters:
+            self._exit_reason = adapter.fatal_error_message or "All messaging adapters disconnected"
+            logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.")
+            await self.stop()
+
+    def _request_clean_exit(self, reason: str) -> None:
+        """Record *reason*, mark the exit as clean, and set the shutdown event."""
+        self._exit_reason = reason
+        self._exit_cleanly = True
+        self._shutdown_event.set()
    
    @staticmethod
    def _load_prefill_messages() -> List[Dict[str, Any]]:
@@ -647,6 +711,11 @@ class GatewayRunner:
        """
        logger.info("Starting Hermes Gateway...")
        logger.info("Session storage: %s", self.config.sessions_dir)
+        try:
+            from gateway.status import write_runtime_status
+            write_runtime_status(gateway_state="starting", exit_reason=None)
+        except Exception:
+            pass
        
        # Warn if no user allowlists are configured and open access is not opted in
        _any_allowlist = any(
@@ -676,6 +745,7 @@ class GatewayRunner:
            logger.warning("Process checkpoint recovery: %s", e)
        
        connected_count = 0
+        startup_nonretryable_errors: list[str] = []
        
        # Initialize and connect each configured platform
        for platform, platform_config in self.config.platforms.items():
@@ -687,8 +757,9 @@ class GatewayRunner:
                logger.warning("No adapter available for %s", platform.value)
                continue
            
-            # Set up message handler
+            # Set up message + fatal error handlers
            adapter.set_message_handler(self._handle_message)
+            adapter.set_fatal_error_handler(self._handle_adapter_fatal_error)
            
            # Try to connect
            logger.info("Connecting to %s...", platform.value)
@@ -701,10 +772,24 @@ class GatewayRunner:
                    logger.info("✓ %s connected", platform.value)
                else:
                    logger.warning("✗ %s failed to connect", platform.value)
+                    if adapter.has_fatal_error and not adapter.fatal_error_retryable:
+                        startup_nonretryable_errors.append(
+                            f"{platform.value}: {adapter.fatal_error_message}"
+                        )
            except Exception as e:
                logger.error("✗ %s error: %s", platform.value, e)
        
        if connected_count == 0:
+            if startup_nonretryable_errors:
+                reason = "; ".join(startup_nonretryable_errors)
+                logger.error("Gateway hit a non-retryable startup conflict: %s", reason)
+                try:
+                    from gateway.status import write_runtime_status
+                    write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
+                except Exception:
+                    pass
+                self._request_clean_exit(reason)
+                return True
            logger.warning("No messaging platforms connected.")
            logger.info("Gateway will continue running for cron job execution.")
        
@@ -712,6 +797,11 @@ class GatewayRunner:
        self.delivery_router.adapters = self.adapters
        
        self._running = True
+        try:
+            from gateway.status import write_runtime_status
+            write_runtime_status(gateway_state="running", exit_reason=None)
+        except Exception:
+            pass
        
        # Emit gateway:startup hook
        hook_count = len(self.hooks.loaded_hooks)
@@ -806,8 +896,12 @@ class GatewayRunner:
        self._shutdown_all_gateway_honcho()
        self._shutdown_event.set()
        
-        from gateway.status import remove_pid_file
+        from gateway.status import remove_pid_file, write_runtime_status
        remove_pid_file()
+        try:
+            write_runtime_status(gateway_state="stopped", exit_reason=self._exit_reason)
+        except Exception:
+            pass
        
        logger.info("Gateway stopped")
    
@@ -3155,9 +3249,14 @@ class GatewayRunner:
        if not git_dir.exists():
            return "✗ Not a git repository — cannot update."

-        hermes_bin = shutil.which("hermes")
-        if not hermes_bin:
-            return "✗ `hermes` command not found on PATH."
+        hermes_cmd = _resolve_hermes_bin()
+        if not hermes_cmd:
+            return (
+                "✗ Could not locate the `hermes` command. "
+                "Hermes is running, but the update command could not find the "
+                "executable on PATH or via the current Python interpreter. "
+                "Try running `hermes update` manually in your terminal."
+            )

        pending_path = _hermes_home / ".update_pending.json"
        output_path = _hermes_home / ".update_output.txt"
@@ -3173,8 +3272,9 @@ class GatewayRunner:

        # Spawn `hermes update` in a separate cgroup so it survives gateway
        # restart. systemd-run --user --scope creates a transient scope unit.
+        hermes_cmd_str = " ".join(shlex.quote(part) for part in hermes_cmd)
        update_cmd = (
-            f"{shlex.quote(hermes_bin)} update > {shlex.quote(str(output_path))} 2>&1; "
+            f"{hermes_cmd_str} update > {shlex.quote(str(output_path))} 2>&1; "
            f"status=$?; printf '%s' \"$status\" > {shlex.quote(str(exit_code_path))}"
        )
        try:
@@ -4338,6 +4438,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
    success = await runner.start()
    if not success:
        return False
+    if runner.should_exit_cleanly:
+        if runner.exit_reason:
+            logger.error("Gateway exiting cleanly: %s", runner.exit_reason)
+        return True
    
    # Write PID file so CLI can detect gateway is running
    import atexit
--- a/gateway/status.py
+++ b/gateway/status.py
@@ -11,13 +11,17 @@ that will be useful when we add named profiles (multiple agents running
 concurrently under distinct configurations).
 """

+import hashlib
 import json
 import os
 import sys
+from datetime import datetime, timezone
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional

 _GATEWAY_KIND = "hermes-gateway"
+_RUNTIME_STATUS_FILE = "gateway_state.json"
+_LOCKS_DIRNAME = "gateway-locks"


 def _get_pid_path() -> Path:
@@ -26,6 +30,32 @@ def _get_pid_path() -> Path:
    return home / "gateway.pid"


+def _get_runtime_status_path() -> Path:
+    """Return the runtime status file path, located next to the gateway PID file."""
+    pid_path = _get_pid_path()
+    return pid_path.parent / _RUNTIME_STATUS_FILE
+
+
+def _get_lock_dir() -> Path:
+    """Return the machine-local directory for token-scoped gateway locks.
+
+    Resolution order:
+    1. ``HERMES_GATEWAY_LOCK_DIR`` when set and non-empty.
+    2. ``$XDG_STATE_HOME/hermes/<locks dir>`` when ``XDG_STATE_HOME`` is set and
+       non-empty.
+    3. ``~/.local/state/hermes/<locks dir>``.
+    """
+    override = os.getenv("HERMES_GATEWAY_LOCK_DIR")
+    if override:
+        return Path(override)
+    # An empty XDG_STATE_HOME counts as unset (XDG Base Directory spec);
+    # Path("") would otherwise place locks relative to the working directory.
+    xdg_state_home = os.getenv("XDG_STATE_HOME")
+    if xdg_state_home:
+        state_home = Path(xdg_state_home)
+    else:
+        state_home = Path.home() / ".local" / "state"
+    return state_home / "hermes" / _LOCKS_DIRNAME
+
+
+def _utc_now_iso() -> str:
+    """Return the current UTC time as a timezone-aware ISO 8601 string."""
+    now = datetime.now(tz=timezone.utc)
+    return now.isoformat()
+
+
+def _scope_hash(identity: str) -> str:
+    """Return the first 16 hex characters of the SHA-256 digest of *identity*."""
+    digest = hashlib.sha256(identity.encode("utf-8"))
+    return digest.hexdigest()[:16]
+
+
+def _get_scope_lock_path(scope: str, identity: str) -> Path:
+    """Return the lock file path for *scope*, named ``<scope>-<identity hash>.lock``."""
+    lock_dir = _get_lock_dir()
+    filename = f"{scope}-{_scope_hash(identity)}.lock"
+    return lock_dir / filename
+
+
 def _get_process_start_time(pid: int) -> Optional[int]:
    """Return the kernel start time for a process when available."""
    stat_path = Path(f"/proc/{pid}/stat")
@@ -73,6 +103,38 @@ def _build_pid_record() -> dict:
    }


+def _build_runtime_status_record() -> dict[str, Any]:
+    """Return a fresh status record: the PID record plus initial runtime fields.
+
+    The runtime fields start as ``gateway_state="starting"``, no exit reason,
+    no platforms, and the current UTC timestamp.
+    """
+    return {
+        **_build_pid_record(),
+        "gateway_state": "starting",
+        "exit_reason": None,
+        "platforms": {},
+        "updated_at": _utc_now_iso(),
+    }
+
+
+def _read_json_file(path: Path) -> Optional[dict[str, Any]]:
+    """Read *path* as a JSON object, returning None on any problem.
+
+    None is returned when the file is missing, unreadable, not valid UTF-8,
+    empty or whitespace-only, not valid JSON, or when the top-level JSON value
+    is not an object.
+    """
+    try:
+        # Read directly rather than checking exists() first: the file can
+        # vanish in between, and a missing file is just another OSError here.
+        raw = path.read_text(encoding="utf-8").strip()
+    except (OSError, UnicodeDecodeError):
+        return None
+    if not raw:
+        return None
+    try:
+        payload = json.loads(raw)
+    except json.JSONDecodeError:
+        return None
+    return payload if isinstance(payload, dict) else None
+
+
+def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
+    """Serialize *payload* as JSON to *path*, creating parent directories.
+
+    The data is first written to a temporary sibling file and then moved into
+    place with ``os.replace``, so a concurrent reader sees either the previous
+    content or the new content rather than a partially written file.
+
+    Raises:
+        TypeError/ValueError: if *payload* is not JSON-serializable.
+        OSError: if the directory or file cannot be written.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
+    try:
+        tmp_path.write_text(json.dumps(payload), encoding="utf-8")
+        os.replace(tmp_path, path)
+    except BaseException:
+        # Do not leave the temporary file behind when the write fails.
+        try:
+            tmp_path.unlink()
+        except OSError:
+            pass
+        raise
+
+
 def _read_pid_record() -> Optional[dict]:
    pid_path = _get_pid_path()
    if not pid_path.exists():
@@ -99,9 +161,49 @@ def _read_pid_record() -> Optional[dict]:

 def write_pid_file() -> None:
    """Write the current process PID and metadata to the gateway PID file."""
-    pid_path = _get_pid_path()
-    pid_path.parent.mkdir(parents=True, exist_ok=True)
-    pid_path.write_text(json.dumps(_build_pid_record()))
+    _write_json_file(_get_pid_path(), _build_pid_record())
+
+
+# Marks "argument not supplied" for fields where an explicit None means "clear".
+_STATUS_FIELD_UNSET: Any = object()
+
+
+def write_runtime_status(
+    *,
+    gateway_state: Optional[str] = None,
+    exit_reason: Any = _STATUS_FIELD_UNSET,
+    platform: Optional[str] = None,
+    platform_state: Optional[str] = None,
+    error_code: Any = _STATUS_FIELD_UNSET,
+    error_message: Any = _STATUS_FIELD_UNSET,
+) -> None:
+    """Persist gateway runtime health information for diagnostics/status.
+
+    Loads the existing status record (or builds a fresh one), applies the
+    supplied fields, and writes it back.
+
+    Args:
+        gateway_state: New gateway state; None leaves the stored value unchanged.
+        exit_reason: New exit reason. Omit to leave it unchanged; pass None to
+            clear a previously stored reason.
+        platform: Platform whose entry is updated; None skips platform updates.
+        platform_state: New state for *platform*; None leaves it unchanged.
+        error_code: Error code for *platform*. Omit to leave it unchanged; pass
+            None to clear a previously stored code.
+        error_message: Error message for *platform*. Omit to leave it unchanged;
+            pass None to clear a previously stored message.
+    """
+    path = _get_runtime_status_path()
+    payload = _read_json_file(path) or _build_runtime_status_record()
+    # Guard against a hand-edited or damaged file where "platforms" is not a dict.
+    if not isinstance(payload.get("platforms"), dict):
+        payload["platforms"] = {}
+    payload.setdefault("kind", _GATEWAY_KIND)
+    # NOTE(review): pid/start_time are only filled in when missing, so a record
+    # left behind by an earlier process keeps that process's values; confirm
+    # whether that is intended.
+    payload.setdefault("pid", os.getpid())
+    payload.setdefault("start_time", _get_process_start_time(os.getpid()))
+    payload["updated_at"] = _utc_now_iso()
+
+    if gateway_state is not None:
+        payload["gateway_state"] = gateway_state
+    # Callers pass exit_reason=None / error_code=None / error_message=None to
+    # reset stale values, so an explicit None is stored; only an omitted
+    # argument leaves the field alone.
+    if exit_reason is not _STATUS_FIELD_UNSET:
+        payload["exit_reason"] = exit_reason
+
+    if platform is not None:
+        platform_payload = payload["platforms"].get(platform)
+        if not isinstance(platform_payload, dict):
+            platform_payload = {}
+        if platform_state is not None:
+            platform_payload["state"] = platform_state
+        if error_code is not _STATUS_FIELD_UNSET:
+            platform_payload["error_code"] = error_code
+        if error_message is not _STATUS_FIELD_UNSET:
+            platform_payload["error_message"] = error_message
+        platform_payload["updated_at"] = _utc_now_iso()
+        payload["platforms"][platform] = platform_payload
+
+    _write_json_file(path, payload)
+
+
+def read_runtime_status() -> Optional[dict[str, Any]]:
+    """Load the persisted gateway runtime status record via ``_read_json_file``."""
+    status_path = _get_runtime_status_path()
+    return _read_json_file(status_path)


 def remove_pid_file() -> None:
@@ -112,6 +214,87 @@ def remove_pid_file() -> None:
        pass


+def _scoped_lock_owner_is_gone(pid: Optional[int], recorded_start: Any) -> bool:
+    """Return True when the process recorded in a lock file no longer holds it."""
+    if pid is None or pid <= 0:
+        # Missing or invalid PID. Values <= 0 would also make os.kill address
+        # process groups instead of a single process.
+        return True
+    try:
+        os.kill(pid, 0)  # signal 0 only probes; nothing is delivered
+    except ProcessLookupError:
+        return True
+    except PermissionError:
+        # The process exists but belongs to another user, so the lock is live.
+        pass
+    current_start = _get_process_start_time(pid)
+    # Same PID with a different start time: the PID now belongs to another process.
+    return (
+        recorded_start is not None
+        and current_start is not None
+        and current_start != recorded_start
+    )
+
+
+def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, Any]] = None) -> tuple[bool, Optional[dict[str, Any]]]:
+    """Acquire a machine-local lock keyed by scope + identity.
+
+    Used to prevent multiple local gateways from using the same external identity
+    at once (e.g. the same Telegram bot token across different HERMES_HOME dirs).
+
+    Args:
+        scope: Lock namespace; becomes the lock file name prefix.
+        identity: Value being locked; only its hash is stored and used in the name.
+        metadata: Optional extra data stored in the lock record.
+
+    Returns:
+        ``(True, None)`` when a new lock file was created,
+        ``(True, previous_record)`` when this process already held the lock and
+        the record was refreshed, or ``(False, holder_record)`` when another
+        process holds it (``holder_record`` may be None if it cannot be read).
+
+    Raises:
+        OSError: if the lock directory or file cannot be created or written.
+    """
+    lock_path = _get_scope_lock_path(scope, identity)
+    lock_path.parent.mkdir(parents=True, exist_ok=True)
+    record = {
+        **_build_pid_record(),
+        "scope": scope,
+        "identity_hash": _scope_hash(identity),
+        "metadata": metadata or {},
+        "updated_at": _utc_now_iso(),
+    }
+
+    existing = _read_json_file(lock_path)
+    if existing:
+        try:
+            existing_pid = int(existing["pid"])
+        except (KeyError, TypeError, ValueError):
+            existing_pid = None
+
+        # Re-acquiring our own lock just refreshes the stored record.
+        if existing_pid == os.getpid() and existing.get("start_time") == record.get("start_time"):
+            _write_json_file(lock_path, record)
+            return True, existing
+
+        if not _scoped_lock_owner_is_gone(existing_pid, existing.get("start_time")):
+            return False, existing
+
+        # Stale lock: remove it and compete for a fresh one below.
+        try:
+            lock_path.unlink(missing_ok=True)
+        except OSError:
+            pass
+
+    try:
+        # O_EXCL makes creation the arbitration point between competing processes.
+        fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+    except FileExistsError:
+        return False, _read_json_file(lock_path)
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as handle:
+            json.dump(record, handle)
+    except Exception:
+        # Do not leave an empty or partial lock file behind.
+        try:
+            lock_path.unlink(missing_ok=True)
+        except OSError:
+            pass
+        raise
+    return True, None
+
+
+def release_scoped_lock(scope: str, identity: str) -> None:
+    """Delete the scope lock file, but only when its record belongs to this process.
+
+    Ownership requires both the recorded PID and the recorded start time to
+    match the current process. A missing/unreadable record, a foreign owner, or
+    a failure to delete the file are all ignored.
+    """
+    lock_path = _get_scope_lock_path(scope, identity)
+    owner = _read_json_file(lock_path)
+    if not owner:
+        return
+
+    my_pid = os.getpid()
+    owned_by_us = (
+        owner.get("pid") == my_pid
+        and owner.get("start_time") == _get_process_start_time(os.getpid())
+    )
+    if owned_by_us:
+        try:
+            lock_path.unlink(missing_ok=True)
+        except OSError:
+            pass
+
+
 def get_running_pid() -> Optional[int]:
    """Return the PID of a running gateway instance, or ``None``.