Merge origin/main into hermes/hermes-5d160594
This commit is contained in:
@@ -161,7 +161,7 @@ class DeliveryRouter:
|
||||
|
||||
# Always include local if configured
|
||||
if self.config.always_log_local:
|
||||
local_key = (Platform.LOCAL, None)
|
||||
local_key = (Platform.LOCAL, None, None)
|
||||
if local_key not in seen_platforms:
|
||||
targets.append(DeliveryTarget(platform=Platform.LOCAL))
|
||||
|
||||
|
||||
@@ -346,6 +346,10 @@ class BasePlatformAdapter(ABC):
|
||||
self.platform = platform
|
||||
self._message_handler: Optional[MessageHandler] = None
|
||||
self._running = False
|
||||
self._fatal_error_code: Optional[str] = None
|
||||
self._fatal_error_message: Optional[str] = None
|
||||
self._fatal_error_retryable = True
|
||||
self._fatal_error_handler: Optional[Callable[["BasePlatformAdapter"], Awaitable[None] | None]] = None
|
||||
|
||||
# Track active message handlers per session for interrupt support
|
||||
# Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt)
|
||||
@@ -353,6 +357,70 @@ class BasePlatformAdapter(ABC):
|
||||
self._pending_messages: Dict[str, MessageEvent] = {}
|
||||
# Chats where auto-TTS on voice input is disabled (set by /voice off)
|
||||
self._auto_tts_disabled_chats: set = set()
|
||||
|
||||
@property
|
||||
def has_fatal_error(self) -> bool:
|
||||
return self._fatal_error_message is not None
|
||||
|
||||
@property
|
||||
def fatal_error_message(self) -> Optional[str]:
|
||||
return self._fatal_error_message
|
||||
|
||||
@property
|
||||
def fatal_error_code(self) -> Optional[str]:
|
||||
return self._fatal_error_code
|
||||
|
||||
@property
|
||||
def fatal_error_retryable(self) -> bool:
|
||||
return self._fatal_error_retryable
|
||||
|
||||
def set_fatal_error_handler(self, handler: Callable[["BasePlatformAdapter"], Awaitable[None] | None]) -> None:
|
||||
self._fatal_error_handler = handler
|
||||
|
||||
def _mark_connected(self) -> None:
|
||||
self._running = True
|
||||
self._fatal_error_code = None
|
||||
self._fatal_error_message = None
|
||||
self._fatal_error_retryable = True
|
||||
try:
|
||||
from gateway.status import write_runtime_status
|
||||
write_runtime_status(platform=self.platform.value, platform_state="connected", error_code=None, error_message=None)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _mark_disconnected(self) -> None:
|
||||
self._running = False
|
||||
if self.has_fatal_error:
|
||||
return
|
||||
try:
|
||||
from gateway.status import write_runtime_status
|
||||
write_runtime_status(platform=self.platform.value, platform_state="disconnected", error_code=None, error_message=None)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _set_fatal_error(self, code: str, message: str, *, retryable: bool) -> None:
|
||||
self._running = False
|
||||
self._fatal_error_code = code
|
||||
self._fatal_error_message = message
|
||||
self._fatal_error_retryable = retryable
|
||||
try:
|
||||
from gateway.status import write_runtime_status
|
||||
write_runtime_status(
|
||||
platform=self.platform.value,
|
||||
platform_state="fatal",
|
||||
error_code=code,
|
||||
error_message=message,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
async def _notify_fatal_error(self) -> None:
|
||||
handler = self._fatal_error_handler
|
||||
if not handler:
|
||||
return
|
||||
result = handler(self)
|
||||
if asyncio.iscoroutine(result):
|
||||
await result
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
|
||||
@@ -105,12 +105,43 @@ class TelegramAdapter(BasePlatformAdapter):
|
||||
|
||||
# Telegram message limits
|
||||
MAX_MESSAGE_LENGTH = 4096
|
||||
MEDIA_GROUP_WAIT_SECONDS = 0.8
|
||||
|
||||
def __init__(self, config: PlatformConfig):
|
||||
super().__init__(config, Platform.TELEGRAM)
|
||||
self._app: Optional[Application] = None
|
||||
self._bot: Optional[Bot] = None
|
||||
|
||||
self._media_group_events: Dict[str, MessageEvent] = {}
|
||||
self._media_group_tasks: Dict[str, asyncio.Task] = {}
|
||||
self._token_lock_identity: Optional[str] = None
|
||||
self._polling_error_task: Optional[asyncio.Task] = None
|
||||
|
||||
@staticmethod
|
||||
def _looks_like_polling_conflict(error: Exception) -> bool:
|
||||
text = str(error).lower()
|
||||
return (
|
||||
error.__class__.__name__.lower() == "conflict"
|
||||
or "terminated by other getupdates request" in text
|
||||
or "another bot instance is running" in text
|
||||
)
|
||||
|
||||
async def _handle_polling_conflict(self, error: Exception) -> None:
|
||||
if self.has_fatal_error and self.fatal_error_code == "telegram_polling_conflict":
|
||||
return
|
||||
message = (
|
||||
"Another Telegram bot poller is already using this token. "
|
||||
"Hermes stopped Telegram polling to avoid endless retry spam. "
|
||||
"Make sure only one gateway instance is running for this bot token."
|
||||
)
|
||||
logger.error("[%s] %s Original error: %s", self.name, message, error)
|
||||
self._set_fatal_error("telegram_polling_conflict", message, retryable=False)
|
||||
try:
|
||||
if self._app and self._app.updater:
|
||||
await self._app.updater.stop()
|
||||
except Exception as stop_error:
|
||||
logger.warning("[%s] Failed stopping Telegram polling after conflict: %s", self.name, stop_error, exc_info=True)
|
||||
await self._notify_fatal_error()
|
||||
|
||||
async def connect(self) -> bool:
|
||||
"""Connect to Telegram and start polling for updates."""
|
||||
if not TELEGRAM_AVAILABLE:
|
||||
@@ -125,6 +156,25 @@ class TelegramAdapter(BasePlatformAdapter):
|
||||
return False
|
||||
|
||||
try:
|
||||
from gateway.status import acquire_scoped_lock
|
||||
|
||||
self._token_lock_identity = self.config.token
|
||||
acquired, existing = acquire_scoped_lock(
|
||||
"telegram-bot-token",
|
||||
self._token_lock_identity,
|
||||
metadata={"platform": self.platform.value},
|
||||
)
|
||||
if not acquired:
|
||||
owner_pid = existing.get("pid") if isinstance(existing, dict) else None
|
||||
message = (
|
||||
"Another local Hermes gateway is already using this Telegram bot token"
|
||||
+ (f" (PID {owner_pid})." if owner_pid else ".")
|
||||
+ " Stop the other gateway before starting a second Telegram poller."
|
||||
)
|
||||
logger.error("[%s] %s", self.name, message)
|
||||
self._set_fatal_error("telegram_token_lock", message, retryable=False)
|
||||
return False
|
||||
|
||||
# Build the application
|
||||
self._app = Application.builder().token(self.config.token).build()
|
||||
self._bot = self._app.bot
|
||||
@@ -150,9 +200,20 @@ class TelegramAdapter(BasePlatformAdapter):
|
||||
# Start polling in background
|
||||
await self._app.initialize()
|
||||
await self._app.start()
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
def _polling_error_callback(error: Exception) -> None:
|
||||
if not self._looks_like_polling_conflict(error):
|
||||
logger.error("[%s] Telegram polling error: %s", self.name, error, exc_info=True)
|
||||
return
|
||||
if self._polling_error_task and not self._polling_error_task.done():
|
||||
return
|
||||
self._polling_error_task = loop.create_task(self._handle_polling_conflict(error))
|
||||
|
||||
await self._app.updater.start_polling(
|
||||
allowed_updates=Update.ALL_TYPES,
|
||||
drop_pending_updates=True,
|
||||
error_callback=_polling_error_callback,
|
||||
)
|
||||
|
||||
# Register bot commands so Telegram shows a hint menu when users type /
|
||||
@@ -188,16 +249,30 @@ class TelegramAdapter(BasePlatformAdapter):
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
self._running = True
|
||||
self._mark_connected()
|
||||
logger.info("[%s] Connected and polling for Telegram updates", self.name)
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
if self._token_lock_identity:
|
||||
try:
|
||||
from gateway.status import release_scoped_lock
|
||||
release_scoped_lock("telegram-bot-token", self._token_lock_identity)
|
||||
except Exception:
|
||||
pass
|
||||
logger.error("[%s] Failed to connect to Telegram: %s", self.name, e, exc_info=True)
|
||||
return False
|
||||
|
||||
async def disconnect(self) -> None:
|
||||
"""Stop polling and disconnect."""
|
||||
"""Stop polling, cancel pending album flushes, and disconnect."""
|
||||
pending_media_group_tasks = list(self._media_group_tasks.values())
|
||||
for task in pending_media_group_tasks:
|
||||
task.cancel()
|
||||
if pending_media_group_tasks:
|
||||
await asyncio.gather(*pending_media_group_tasks, return_exceptions=True)
|
||||
self._media_group_tasks.clear()
|
||||
self._media_group_events.clear()
|
||||
|
||||
if self._app:
|
||||
try:
|
||||
await self._app.updater.stop()
|
||||
@@ -205,10 +280,17 @@ class TelegramAdapter(BasePlatformAdapter):
|
||||
await self._app.shutdown()
|
||||
except Exception as e:
|
||||
logger.warning("[%s] Error during Telegram disconnect: %s", self.name, e, exc_info=True)
|
||||
if self._token_lock_identity:
|
||||
try:
|
||||
from gateway.status import release_scoped_lock
|
||||
release_scoped_lock("telegram-bot-token", self._token_lock_identity)
|
||||
except Exception as e:
|
||||
logger.warning("[%s] Error releasing Telegram token lock: %s", self.name, e, exc_info=True)
|
||||
|
||||
self._running = False
|
||||
self._mark_disconnected()
|
||||
self._app = None
|
||||
self._bot = None
|
||||
self._token_lock_identity = None
|
||||
logger.info("[%s] Disconnected from Telegram", self.name)
|
||||
|
||||
async def send(
|
||||
@@ -872,8 +954,53 @@ class TelegramAdapter(BasePlatformAdapter):
|
||||
except Exception as e:
|
||||
logger.warning("[Telegram] Failed to cache document: %s", e, exc_info=True)
|
||||
|
||||
media_group_id = getattr(msg, "media_group_id", None)
|
||||
if media_group_id:
|
||||
await self._queue_media_group_event(str(media_group_id), event)
|
||||
return
|
||||
|
||||
await self.handle_message(event)
|
||||
|
||||
async def _queue_media_group_event(self, media_group_id: str, event: MessageEvent) -> None:
|
||||
"""Buffer Telegram media-group items so albums arrive as one logical event.
|
||||
|
||||
Telegram delivers albums as multiple updates with a shared media_group_id.
|
||||
If we forward each item immediately, the gateway thinks the second image is a
|
||||
new user message and interrupts the first. We debounce briefly and merge the
|
||||
attachments into a single MessageEvent.
|
||||
"""
|
||||
existing = self._media_group_events.get(media_group_id)
|
||||
if existing is None:
|
||||
self._media_group_events[media_group_id] = event
|
||||
else:
|
||||
existing.media_urls.extend(event.media_urls)
|
||||
existing.media_types.extend(event.media_types)
|
||||
if event.text:
|
||||
if existing.text:
|
||||
if event.text not in existing.text.split("\n\n"):
|
||||
existing.text = f"{existing.text}\n\n{event.text}"
|
||||
else:
|
||||
existing.text = event.text
|
||||
|
||||
prior_task = self._media_group_tasks.get(media_group_id)
|
||||
if prior_task:
|
||||
prior_task.cancel()
|
||||
|
||||
self._media_group_tasks[media_group_id] = asyncio.create_task(
|
||||
self._flush_media_group_event(media_group_id)
|
||||
)
|
||||
|
||||
async def _flush_media_group_event(self, media_group_id: str) -> None:
|
||||
try:
|
||||
await asyncio.sleep(self.MEDIA_GROUP_WAIT_SECONDS)
|
||||
event = self._media_group_events.pop(media_group_id, None)
|
||||
if event is not None:
|
||||
await self.handle_message(event)
|
||||
except asyncio.CancelledError:
|
||||
return
|
||||
finally:
|
||||
self._media_group_tasks.pop(media_group_id, None)
|
||||
|
||||
async def _handle_sticker(self, msg: Message, event: "MessageEvent") -> None:
|
||||
"""
|
||||
Describe a Telegram sticker via vision analysis, with caching.
|
||||
|
||||
116
gateway/run.py
116
gateway/run.py
@@ -215,6 +215,33 @@ def _resolve_gateway_model() -> str:
|
||||
return model
|
||||
|
||||
|
||||
def _resolve_hermes_bin() -> Optional[list[str]]:
|
||||
"""Resolve the Hermes update command as argv parts.
|
||||
|
||||
Tries in order:
|
||||
1. ``shutil.which("hermes")`` — standard PATH lookup
|
||||
2. ``sys.executable -m hermes_cli.main`` — fallback when Hermes is running
|
||||
from a venv/module invocation and the ``hermes`` shim is not on PATH
|
||||
|
||||
Returns argv parts ready for quoting/joining, or ``None`` if neither works.
|
||||
"""
|
||||
import shutil
|
||||
|
||||
hermes_bin = shutil.which("hermes")
|
||||
if hermes_bin:
|
||||
return [hermes_bin]
|
||||
|
||||
try:
|
||||
import importlib.util
|
||||
|
||||
if importlib.util.find_spec("hermes_cli") is not None:
|
||||
return [sys.executable, "-m", "hermes_cli.main"]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class GatewayRunner:
|
||||
"""
|
||||
Main gateway controller.
|
||||
@@ -245,6 +272,8 @@ class GatewayRunner:
|
||||
self.delivery_router = DeliveryRouter(self.config)
|
||||
self._running = False
|
||||
self._shutdown_event = asyncio.Event()
|
||||
self._exit_cleanly = False
|
||||
self._exit_reason: Optional[str] = None
|
||||
|
||||
# Track running agents per session for interrupt support
|
||||
# Key: session_key, Value: AIAgent instance
|
||||
@@ -463,6 +492,41 @@ class GatewayRunner:
|
||||
"""Run the sync memory flush in a thread pool so it won't block the event loop."""
|
||||
loop = asyncio.get_event_loop()
|
||||
await loop.run_in_executor(None, self._flush_memories_for_session, old_session_id)
|
||||
|
||||
@property
|
||||
def should_exit_cleanly(self) -> bool:
|
||||
return self._exit_cleanly
|
||||
|
||||
@property
|
||||
def exit_reason(self) -> Optional[str]:
|
||||
return self._exit_reason
|
||||
|
||||
async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None:
|
||||
"""React to a non-retryable adapter failure after startup."""
|
||||
logger.error(
|
||||
"Fatal %s adapter error (%s): %s",
|
||||
adapter.platform.value,
|
||||
adapter.fatal_error_code or "unknown",
|
||||
adapter.fatal_error_message or "unknown error",
|
||||
)
|
||||
|
||||
existing = self.adapters.get(adapter.platform)
|
||||
if existing is adapter:
|
||||
try:
|
||||
await adapter.disconnect()
|
||||
finally:
|
||||
self.adapters.pop(adapter.platform, None)
|
||||
self.delivery_router.adapters = self.adapters
|
||||
|
||||
if not self.adapters:
|
||||
self._exit_reason = adapter.fatal_error_message or "All messaging adapters disconnected"
|
||||
logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.")
|
||||
await self.stop()
|
||||
|
||||
def _request_clean_exit(self, reason: str) -> None:
|
||||
self._exit_cleanly = True
|
||||
self._exit_reason = reason
|
||||
self._shutdown_event.set()
|
||||
|
||||
@staticmethod
|
||||
def _load_prefill_messages() -> List[Dict[str, Any]]:
|
||||
@@ -647,6 +711,11 @@ class GatewayRunner:
|
||||
"""
|
||||
logger.info("Starting Hermes Gateway...")
|
||||
logger.info("Session storage: %s", self.config.sessions_dir)
|
||||
try:
|
||||
from gateway.status import write_runtime_status
|
||||
write_runtime_status(gateway_state="starting", exit_reason=None)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Warn if no user allowlists are configured and open access is not opted in
|
||||
_any_allowlist = any(
|
||||
@@ -676,6 +745,7 @@ class GatewayRunner:
|
||||
logger.warning("Process checkpoint recovery: %s", e)
|
||||
|
||||
connected_count = 0
|
||||
startup_nonretryable_errors: list[str] = []
|
||||
|
||||
# Initialize and connect each configured platform
|
||||
for platform, platform_config in self.config.platforms.items():
|
||||
@@ -687,8 +757,9 @@ class GatewayRunner:
|
||||
logger.warning("No adapter available for %s", platform.value)
|
||||
continue
|
||||
|
||||
# Set up message handler
|
||||
# Set up message + fatal error handlers
|
||||
adapter.set_message_handler(self._handle_message)
|
||||
adapter.set_fatal_error_handler(self._handle_adapter_fatal_error)
|
||||
|
||||
# Try to connect
|
||||
logger.info("Connecting to %s...", platform.value)
|
||||
@@ -701,10 +772,24 @@ class GatewayRunner:
|
||||
logger.info("✓ %s connected", platform.value)
|
||||
else:
|
||||
logger.warning("✗ %s failed to connect", platform.value)
|
||||
if adapter.has_fatal_error and not adapter.fatal_error_retryable:
|
||||
startup_nonretryable_errors.append(
|
||||
f"{platform.value}: {adapter.fatal_error_message}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("✗ %s error: %s", platform.value, e)
|
||||
|
||||
if connected_count == 0:
|
||||
if startup_nonretryable_errors:
|
||||
reason = "; ".join(startup_nonretryable_errors)
|
||||
logger.error("Gateway hit a non-retryable startup conflict: %s", reason)
|
||||
try:
|
||||
from gateway.status import write_runtime_status
|
||||
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
|
||||
except Exception:
|
||||
pass
|
||||
self._request_clean_exit(reason)
|
||||
return True
|
||||
logger.warning("No messaging platforms connected.")
|
||||
logger.info("Gateway will continue running for cron job execution.")
|
||||
|
||||
@@ -712,6 +797,11 @@ class GatewayRunner:
|
||||
self.delivery_router.adapters = self.adapters
|
||||
|
||||
self._running = True
|
||||
try:
|
||||
from gateway.status import write_runtime_status
|
||||
write_runtime_status(gateway_state="running", exit_reason=None)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Emit gateway:startup hook
|
||||
hook_count = len(self.hooks.loaded_hooks)
|
||||
@@ -806,8 +896,12 @@ class GatewayRunner:
|
||||
self._shutdown_all_gateway_honcho()
|
||||
self._shutdown_event.set()
|
||||
|
||||
from gateway.status import remove_pid_file
|
||||
from gateway.status import remove_pid_file, write_runtime_status
|
||||
remove_pid_file()
|
||||
try:
|
||||
write_runtime_status(gateway_state="stopped", exit_reason=self._exit_reason)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
logger.info("Gateway stopped")
|
||||
|
||||
@@ -3155,9 +3249,14 @@ class GatewayRunner:
|
||||
if not git_dir.exists():
|
||||
return "✗ Not a git repository — cannot update."
|
||||
|
||||
hermes_bin = shutil.which("hermes")
|
||||
if not hermes_bin:
|
||||
return "✗ `hermes` command not found on PATH."
|
||||
hermes_cmd = _resolve_hermes_bin()
|
||||
if not hermes_cmd:
|
||||
return (
|
||||
"✗ Could not locate the `hermes` command. "
|
||||
"Hermes is running, but the update command could not find the "
|
||||
"executable on PATH or via the current Python interpreter. "
|
||||
"Try running `hermes update` manually in your terminal."
|
||||
)
|
||||
|
||||
pending_path = _hermes_home / ".update_pending.json"
|
||||
output_path = _hermes_home / ".update_output.txt"
|
||||
@@ -3173,8 +3272,9 @@ class GatewayRunner:
|
||||
|
||||
# Spawn `hermes update` in a separate cgroup so it survives gateway
|
||||
# restart. systemd-run --user --scope creates a transient scope unit.
|
||||
hermes_cmd_str = " ".join(shlex.quote(part) for part in hermes_cmd)
|
||||
update_cmd = (
|
||||
f"{shlex.quote(hermes_bin)} update > {shlex.quote(str(output_path))} 2>&1; "
|
||||
f"{hermes_cmd_str} update > {shlex.quote(str(output_path))} 2>&1; "
|
||||
f"status=$?; printf '%s' \"$status\" > {shlex.quote(str(exit_code_path))}"
|
||||
)
|
||||
try:
|
||||
@@ -4338,6 +4438,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
||||
success = await runner.start()
|
||||
if not success:
|
||||
return False
|
||||
if runner.should_exit_cleanly:
|
||||
if runner.exit_reason:
|
||||
logger.error("Gateway exiting cleanly: %s", runner.exit_reason)
|
||||
return True
|
||||
|
||||
# Write PID file so CLI can detect gateway is running
|
||||
import atexit
|
||||
|
||||
@@ -11,13 +11,17 @@ that will be useful when we add named profiles (multiple agents running
|
||||
concurrently under distinct configurations).
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
_GATEWAY_KIND = "hermes-gateway"
|
||||
_RUNTIME_STATUS_FILE = "gateway_state.json"
|
||||
_LOCKS_DIRNAME = "gateway-locks"
|
||||
|
||||
|
||||
def _get_pid_path() -> Path:
|
||||
@@ -26,6 +30,32 @@ def _get_pid_path() -> Path:
|
||||
return home / "gateway.pid"
|
||||
|
||||
|
||||
def _get_runtime_status_path() -> Path:
|
||||
"""Return the persisted runtime health/status file path."""
|
||||
return _get_pid_path().with_name(_RUNTIME_STATUS_FILE)
|
||||
|
||||
|
||||
def _get_lock_dir() -> Path:
|
||||
"""Return the machine-local directory for token-scoped gateway locks."""
|
||||
override = os.getenv("HERMES_GATEWAY_LOCK_DIR")
|
||||
if override:
|
||||
return Path(override)
|
||||
state_home = Path(os.getenv("XDG_STATE_HOME", Path.home() / ".local" / "state"))
|
||||
return state_home / "hermes" / _LOCKS_DIRNAME
|
||||
|
||||
|
||||
def _utc_now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _scope_hash(identity: str) -> str:
|
||||
return hashlib.sha256(identity.encode("utf-8")).hexdigest()[:16]
|
||||
|
||||
|
||||
def _get_scope_lock_path(scope: str, identity: str) -> Path:
|
||||
return _get_lock_dir() / f"{scope}-{_scope_hash(identity)}.lock"
|
||||
|
||||
|
||||
def _get_process_start_time(pid: int) -> Optional[int]:
|
||||
"""Return the kernel start time for a process when available."""
|
||||
stat_path = Path(f"/proc/{pid}/stat")
|
||||
@@ -73,6 +103,38 @@ def _build_pid_record() -> dict:
|
||||
}
|
||||
|
||||
|
||||
def _build_runtime_status_record() -> dict[str, Any]:
|
||||
payload = _build_pid_record()
|
||||
payload.update({
|
||||
"gateway_state": "starting",
|
||||
"exit_reason": None,
|
||||
"platforms": {},
|
||||
"updated_at": _utc_now_iso(),
|
||||
})
|
||||
return payload
|
||||
|
||||
|
||||
def _read_json_file(path: Path) -> Optional[dict[str, Any]]:
|
||||
if not path.exists():
|
||||
return None
|
||||
try:
|
||||
raw = path.read_text().strip()
|
||||
except OSError:
|
||||
return None
|
||||
if not raw:
|
||||
return None
|
||||
try:
|
||||
payload = json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
return payload if isinstance(payload, dict) else None
|
||||
|
||||
|
||||
def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(payload))
|
||||
|
||||
|
||||
def _read_pid_record() -> Optional[dict]:
|
||||
pid_path = _get_pid_path()
|
||||
if not pid_path.exists():
|
||||
@@ -99,9 +161,49 @@ def _read_pid_record() -> Optional[dict]:
|
||||
|
||||
def write_pid_file() -> None:
|
||||
"""Write the current process PID and metadata to the gateway PID file."""
|
||||
pid_path = _get_pid_path()
|
||||
pid_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pid_path.write_text(json.dumps(_build_pid_record()))
|
||||
_write_json_file(_get_pid_path(), _build_pid_record())
|
||||
|
||||
|
||||
def write_runtime_status(
|
||||
*,
|
||||
gateway_state: Optional[str] = None,
|
||||
exit_reason: Optional[str] = None,
|
||||
platform: Optional[str] = None,
|
||||
platform_state: Optional[str] = None,
|
||||
error_code: Optional[str] = None,
|
||||
error_message: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Persist gateway runtime health information for diagnostics/status."""
|
||||
path = _get_runtime_status_path()
|
||||
payload = _read_json_file(path) or _build_runtime_status_record()
|
||||
payload.setdefault("platforms", {})
|
||||
payload.setdefault("kind", _GATEWAY_KIND)
|
||||
payload.setdefault("pid", os.getpid())
|
||||
payload.setdefault("start_time", _get_process_start_time(os.getpid()))
|
||||
payload["updated_at"] = _utc_now_iso()
|
||||
|
||||
if gateway_state is not None:
|
||||
payload["gateway_state"] = gateway_state
|
||||
if exit_reason is not None:
|
||||
payload["exit_reason"] = exit_reason
|
||||
|
||||
if platform is not None:
|
||||
platform_payload = payload["platforms"].get(platform, {})
|
||||
if platform_state is not None:
|
||||
platform_payload["state"] = platform_state
|
||||
if error_code is not None:
|
||||
platform_payload["error_code"] = error_code
|
||||
if error_message is not None:
|
||||
platform_payload["error_message"] = error_message
|
||||
platform_payload["updated_at"] = _utc_now_iso()
|
||||
payload["platforms"][platform] = platform_payload
|
||||
|
||||
_write_json_file(path, payload)
|
||||
|
||||
|
||||
def read_runtime_status() -> Optional[dict[str, Any]]:
|
||||
"""Read the persisted gateway runtime health/status information."""
|
||||
return _read_json_file(_get_runtime_status_path())
|
||||
|
||||
|
||||
def remove_pid_file() -> None:
|
||||
@@ -112,6 +214,87 @@ def remove_pid_file() -> None:
|
||||
pass
|
||||
|
||||
|
||||
def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, Any]] = None) -> tuple[bool, Optional[dict[str, Any]]]:
|
||||
"""Acquire a machine-local lock keyed by scope + identity.
|
||||
|
||||
Used to prevent multiple local gateways from using the same external identity
|
||||
at once (e.g. the same Telegram bot token across different HERMES_HOME dirs).
|
||||
"""
|
||||
lock_path = _get_scope_lock_path(scope, identity)
|
||||
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
record = {
|
||||
**_build_pid_record(),
|
||||
"scope": scope,
|
||||
"identity_hash": _scope_hash(identity),
|
||||
"metadata": metadata or {},
|
||||
"updated_at": _utc_now_iso(),
|
||||
}
|
||||
|
||||
existing = _read_json_file(lock_path)
|
||||
if existing:
|
||||
try:
|
||||
existing_pid = int(existing["pid"])
|
||||
except (KeyError, TypeError, ValueError):
|
||||
existing_pid = None
|
||||
|
||||
if existing_pid == os.getpid() and existing.get("start_time") == record.get("start_time"):
|
||||
_write_json_file(lock_path, record)
|
||||
return True, existing
|
||||
|
||||
stale = existing_pid is None
|
||||
if not stale:
|
||||
try:
|
||||
os.kill(existing_pid, 0)
|
||||
except (ProcessLookupError, PermissionError):
|
||||
stale = True
|
||||
else:
|
||||
current_start = _get_process_start_time(existing_pid)
|
||||
if (
|
||||
existing.get("start_time") is not None
|
||||
and current_start is not None
|
||||
and current_start != existing.get("start_time")
|
||||
):
|
||||
stale = True
|
||||
if stale:
|
||||
try:
|
||||
lock_path.unlink(missing_ok=True)
|
||||
except OSError:
|
||||
pass
|
||||
else:
|
||||
return False, existing
|
||||
|
||||
try:
|
||||
fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
|
||||
except FileExistsError:
|
||||
return False, _read_json_file(lock_path)
|
||||
try:
|
||||
with os.fdopen(fd, "w", encoding="utf-8") as handle:
|
||||
json.dump(record, handle)
|
||||
except Exception:
|
||||
try:
|
||||
lock_path.unlink(missing_ok=True)
|
||||
except OSError:
|
||||
pass
|
||||
raise
|
||||
return True, None
|
||||
|
||||
|
||||
def release_scoped_lock(scope: str, identity: str) -> None:
|
||||
"""Release a previously-acquired scope lock when owned by this process."""
|
||||
lock_path = _get_scope_lock_path(scope, identity)
|
||||
existing = _read_json_file(lock_path)
|
||||
if not existing:
|
||||
return
|
||||
if existing.get("pid") != os.getpid():
|
||||
return
|
||||
if existing.get("start_time") != _get_process_start_time(os.getpid()):
|
||||
return
|
||||
try:
|
||||
lock_path.unlink(missing_ok=True)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def get_running_pid() -> Optional[int]:
|
||||
"""Return the PID of a running gateway instance, or ``None``.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user