Merge origin/main into hermes/hermes-5d160594

This commit is contained in:
teknium1
2026-03-14 19:34:05 -07:00
78 changed files with 3762 additions and 395 deletions

View File

@@ -161,7 +161,7 @@ class DeliveryRouter:
# Always include local if configured
if self.config.always_log_local:
local_key = (Platform.LOCAL, None)
local_key = (Platform.LOCAL, None, None)
if local_key not in seen_platforms:
targets.append(DeliveryTarget(platform=Platform.LOCAL))

View File

@@ -346,6 +346,10 @@ class BasePlatformAdapter(ABC):
self.platform = platform
self._message_handler: Optional[MessageHandler] = None
self._running = False
self._fatal_error_code: Optional[str] = None
self._fatal_error_message: Optional[str] = None
self._fatal_error_retryable = True
self._fatal_error_handler: Optional[Callable[["BasePlatformAdapter"], Awaitable[None] | None]] = None
# Track active message handlers per session for interrupt support
# Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt)
@@ -353,6 +357,70 @@ class BasePlatformAdapter(ABC):
self._pending_messages: Dict[str, MessageEvent] = {}
# Chats where auto-TTS on voice input is disabled (set by /voice off)
self._auto_tts_disabled_chats: set = set()
@property
def has_fatal_error(self) -> bool:
    """Report whether a fatal error is currently recorded for this adapter.

    The stored fatal-error message doubles as the flag: any value other
    than ``None`` (including an empty string) counts as a recorded error.
    """
    recorded_message = self._fatal_error_message
    return recorded_message is not None
@property
def fatal_error_message(self) -> Optional[str]:
    """Return the recorded fatal-error message, or ``None`` when none is set."""
    message = self._fatal_error_message
    return message
@property
def fatal_error_code(self) -> Optional[str]:
    """Return the recorded fatal-error code, or ``None`` when none is set."""
    code = self._fatal_error_code
    return code
@property
def fatal_error_retryable(self) -> bool:
    """Return the retryable flag stored alongside the fatal-error state."""
    retryable = self._fatal_error_retryable
    return retryable
def set_fatal_error_handler(self, handler: Callable[["BasePlatformAdapter"], Awaitable[None] | None]) -> None:
self._fatal_error_handler = handler
def _mark_connected(self) -> None:
self._running = True
self._fatal_error_code = None
self._fatal_error_message = None
self._fatal_error_retryable = True
try:
from gateway.status import write_runtime_status
write_runtime_status(platform=self.platform.value, platform_state="connected", error_code=None, error_message=None)
except Exception:
pass
def _mark_disconnected(self) -> None:
self._running = False
if self.has_fatal_error:
return
try:
from gateway.status import write_runtime_status
write_runtime_status(platform=self.platform.value, platform_state="disconnected", error_code=None, error_message=None)
except Exception:
pass
def _set_fatal_error(self, code: str, message: str, *, retryable: bool) -> None:
self._running = False
self._fatal_error_code = code
self._fatal_error_message = message
self._fatal_error_retryable = retryable
try:
from gateway.status import write_runtime_status
write_runtime_status(
platform=self.platform.value,
platform_state="fatal",
error_code=code,
error_message=message,
)
except Exception:
pass
async def _notify_fatal_error(self) -> None:
handler = self._fatal_error_handler
if not handler:
return
result = handler(self)
if asyncio.iscoroutine(result):
await result
@property
def name(self) -> str:

View File

@@ -105,12 +105,43 @@ class TelegramAdapter(BasePlatformAdapter):
# Telegram message limits
MAX_MESSAGE_LENGTH = 4096
MEDIA_GROUP_WAIT_SECONDS = 0.8
def __init__(self, config: PlatformConfig):
    """Set up the Telegram-specific state of the adapter.

    Args:
        config: Platform configuration forwarded to the base adapter.
    """
    super().__init__(config, Platform.TELEGRAM)

    # Telegram application/bot handles; populated by ``connect``.
    self._app: Optional[Application] = None
    self._bot: Optional[Bot] = None

    # Album buffering, both keyed by Telegram ``media_group_id``:
    # the merged event so far and the task that will flush it.
    self._media_group_events: Dict[str, MessageEvent] = {}
    self._media_group_tasks: Dict[str, asyncio.Task] = {}

    # Identity used for the scoped bot-token lock while connected.
    self._token_lock_identity: Optional[str] = None
    # Task handling a polling conflict reported by the polling error callback.
    self._polling_error_task: Optional[asyncio.Task] = None
@staticmethod
def _looks_like_polling_conflict(error: Exception) -> bool:
text = str(error).lower()
return (
error.__class__.__name__.lower() == "conflict"
or "terminated by other getupdates request" in text
or "another bot instance is running" in text
)
async def _handle_polling_conflict(self, error: Exception) -> None:
if self.has_fatal_error and self.fatal_error_code == "telegram_polling_conflict":
return
message = (
"Another Telegram bot poller is already using this token. "
"Hermes stopped Telegram polling to avoid endless retry spam. "
"Make sure only one gateway instance is running for this bot token."
)
logger.error("[%s] %s Original error: %s", self.name, message, error)
self._set_fatal_error("telegram_polling_conflict", message, retryable=False)
try:
if self._app and self._app.updater:
await self._app.updater.stop()
except Exception as stop_error:
logger.warning("[%s] Failed stopping Telegram polling after conflict: %s", self.name, stop_error, exc_info=True)
await self._notify_fatal_error()
async def connect(self) -> bool:
"""Connect to Telegram and start polling for updates."""
if not TELEGRAM_AVAILABLE:
@@ -125,6 +156,25 @@ class TelegramAdapter(BasePlatformAdapter):
return False
try:
from gateway.status import acquire_scoped_lock
self._token_lock_identity = self.config.token
acquired, existing = acquire_scoped_lock(
"telegram-bot-token",
self._token_lock_identity,
metadata={"platform": self.platform.value},
)
if not acquired:
owner_pid = existing.get("pid") if isinstance(existing, dict) else None
message = (
"Another local Hermes gateway is already using this Telegram bot token"
+ (f" (PID {owner_pid})." if owner_pid else ".")
+ " Stop the other gateway before starting a second Telegram poller."
)
logger.error("[%s] %s", self.name, message)
self._set_fatal_error("telegram_token_lock", message, retryable=False)
return False
# Build the application
self._app = Application.builder().token(self.config.token).build()
self._bot = self._app.bot
@@ -150,9 +200,20 @@ class TelegramAdapter(BasePlatformAdapter):
# Start polling in background
await self._app.initialize()
await self._app.start()
loop = asyncio.get_running_loop()
def _polling_error_callback(error: Exception) -> None:
if not self._looks_like_polling_conflict(error):
logger.error("[%s] Telegram polling error: %s", self.name, error, exc_info=True)
return
if self._polling_error_task and not self._polling_error_task.done():
return
self._polling_error_task = loop.create_task(self._handle_polling_conflict(error))
await self._app.updater.start_polling(
allowed_updates=Update.ALL_TYPES,
drop_pending_updates=True,
error_callback=_polling_error_callback,
)
# Register bot commands so Telegram shows a hint menu when users type /
@@ -188,16 +249,30 @@ class TelegramAdapter(BasePlatformAdapter):
exc_info=True,
)
self._running = True
self._mark_connected()
logger.info("[%s] Connected and polling for Telegram updates", self.name)
return True
except Exception as e:
if self._token_lock_identity:
try:
from gateway.status import release_scoped_lock
release_scoped_lock("telegram-bot-token", self._token_lock_identity)
except Exception:
pass
logger.error("[%s] Failed to connect to Telegram: %s", self.name, e, exc_info=True)
return False
async def disconnect(self) -> None:
"""Stop polling and disconnect."""
"""Stop polling, cancel pending album flushes, and disconnect."""
pending_media_group_tasks = list(self._media_group_tasks.values())
for task in pending_media_group_tasks:
task.cancel()
if pending_media_group_tasks:
await asyncio.gather(*pending_media_group_tasks, return_exceptions=True)
self._media_group_tasks.clear()
self._media_group_events.clear()
if self._app:
try:
await self._app.updater.stop()
@@ -205,10 +280,17 @@ class TelegramAdapter(BasePlatformAdapter):
await self._app.shutdown()
except Exception as e:
logger.warning("[%s] Error during Telegram disconnect: %s", self.name, e, exc_info=True)
if self._token_lock_identity:
try:
from gateway.status import release_scoped_lock
release_scoped_lock("telegram-bot-token", self._token_lock_identity)
except Exception as e:
logger.warning("[%s] Error releasing Telegram token lock: %s", self.name, e, exc_info=True)
self._running = False
self._mark_disconnected()
self._app = None
self._bot = None
self._token_lock_identity = None
logger.info("[%s] Disconnected from Telegram", self.name)
async def send(
@@ -872,8 +954,53 @@ class TelegramAdapter(BasePlatformAdapter):
except Exception as e:
logger.warning("[Telegram] Failed to cache document: %s", e, exc_info=True)
media_group_id = getattr(msg, "media_group_id", None)
if media_group_id:
await self._queue_media_group_event(str(media_group_id), event)
return
await self.handle_message(event)
async def _queue_media_group_event(self, media_group_id: str, event: MessageEvent) -> None:
"""Buffer Telegram media-group items so albums arrive as one logical event.
Telegram delivers albums as multiple updates with a shared media_group_id.
If we forward each item immediately, the gateway thinks the second image is a
new user message and interrupts the first. We debounce briefly and merge the
attachments into a single MessageEvent.
"""
existing = self._media_group_events.get(media_group_id)
if existing is None:
self._media_group_events[media_group_id] = event
else:
existing.media_urls.extend(event.media_urls)
existing.media_types.extend(event.media_types)
if event.text:
if existing.text:
if event.text not in existing.text.split("\n\n"):
existing.text = f"{existing.text}\n\n{event.text}"
else:
existing.text = event.text
prior_task = self._media_group_tasks.get(media_group_id)
if prior_task:
prior_task.cancel()
self._media_group_tasks[media_group_id] = asyncio.create_task(
self._flush_media_group_event(media_group_id)
)
async def _flush_media_group_event(self, media_group_id: str) -> None:
try:
await asyncio.sleep(self.MEDIA_GROUP_WAIT_SECONDS)
event = self._media_group_events.pop(media_group_id, None)
if event is not None:
await self.handle_message(event)
except asyncio.CancelledError:
return
finally:
self._media_group_tasks.pop(media_group_id, None)
async def _handle_sticker(self, msg: Message, event: "MessageEvent") -> None:
"""
Describe a Telegram sticker via vision analysis, with caching.

View File

@@ -215,6 +215,33 @@ def _resolve_gateway_model() -> str:
return model
def _resolve_hermes_bin() -> Optional[list[str]]:
"""Resolve the Hermes update command as argv parts.
Tries in order:
1. ``shutil.which("hermes")`` — standard PATH lookup
2. ``sys.executable -m hermes_cli.main`` — fallback when Hermes is running
from a venv/module invocation and the ``hermes`` shim is not on PATH
Returns argv parts ready for quoting/joining, or ``None`` if neither works.
"""
import shutil
hermes_bin = shutil.which("hermes")
if hermes_bin:
return [hermes_bin]
try:
import importlib.util
if importlib.util.find_spec("hermes_cli") is not None:
return [sys.executable, "-m", "hermes_cli.main"]
except Exception:
pass
return None
class GatewayRunner:
"""
Main gateway controller.
@@ -245,6 +272,8 @@ class GatewayRunner:
self.delivery_router = DeliveryRouter(self.config)
self._running = False
self._shutdown_event = asyncio.Event()
self._exit_cleanly = False
self._exit_reason: Optional[str] = None
# Track running agents per session for interrupt support
# Key: session_key, Value: AIAgent instance
@@ -463,6 +492,41 @@ class GatewayRunner:
"""Run the sync memory flush in a thread pool so it won't block the event loop."""
loop = asyncio.get_event_loop()
await loop.run_in_executor(None, self._flush_memories_for_session, old_session_id)
@property
def should_exit_cleanly(self) -> bool:
    """Return the flag set by ``_request_clean_exit``."""
    clean_exit_requested = self._exit_cleanly
    return clean_exit_requested
@property
def exit_reason(self) -> Optional[str]:
    """Return the recorded exit reason, or ``None`` when none was set."""
    reason = self._exit_reason
    return reason
async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None:
    """React to a non-retryable adapter failure after startup.

    The failing adapter is disconnected and removed from the active set
    (only when it is still the registered adapter for its platform). When
    no adapters remain afterwards, the exit reason is recorded and the
    gateway is stopped.

    Args:
        adapter: The adapter reporting the fatal error.
    """
    logger.error(
        "Fatal %s adapter error (%s): %s",
        adapter.platform.value,
        adapter.fatal_error_code or "unknown",
        adapter.fatal_error_message or "unknown error",
    )

    existing = self.adapters.get(adapter.platform)
    if existing is adapter:
        try:
            await adapter.disconnect()
        except Exception as disconnect_error:
            # A failing disconnect must not abort this handler: the adapter
            # is dropped regardless, and the shutdown check below still has
            # to run, otherwise the gateway keeps running with no adapters.
            logger.warning(
                "Error disconnecting %s adapter after fatal error: %s",
                adapter.platform.value,
                disconnect_error,
                exc_info=True,
            )
        finally:
            self.adapters.pop(adapter.platform, None)
            self.delivery_router.adapters = self.adapters

    if not self.adapters:
        self._exit_reason = adapter.fatal_error_message or "All messaging adapters disconnected"
        logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.")
        await self.stop()
def _request_clean_exit(self, reason: str) -> None:
self._exit_cleanly = True
self._exit_reason = reason
self._shutdown_event.set()
@staticmethod
def _load_prefill_messages() -> List[Dict[str, Any]]:
@@ -647,6 +711,11 @@ class GatewayRunner:
"""
logger.info("Starting Hermes Gateway...")
logger.info("Session storage: %s", self.config.sessions_dir)
try:
from gateway.status import write_runtime_status
write_runtime_status(gateway_state="starting", exit_reason=None)
except Exception:
pass
# Warn if no user allowlists are configured and open access is not opted in
_any_allowlist = any(
@@ -676,6 +745,7 @@ class GatewayRunner:
logger.warning("Process checkpoint recovery: %s", e)
connected_count = 0
startup_nonretryable_errors: list[str] = []
# Initialize and connect each configured platform
for platform, platform_config in self.config.platforms.items():
@@ -687,8 +757,9 @@ class GatewayRunner:
logger.warning("No adapter available for %s", platform.value)
continue
# Set up message handler
# Set up message + fatal error handlers
adapter.set_message_handler(self._handle_message)
adapter.set_fatal_error_handler(self._handle_adapter_fatal_error)
# Try to connect
logger.info("Connecting to %s...", platform.value)
@@ -701,10 +772,24 @@ class GatewayRunner:
logger.info("%s connected", platform.value)
else:
logger.warning("%s failed to connect", platform.value)
if adapter.has_fatal_error and not adapter.fatal_error_retryable:
startup_nonretryable_errors.append(
f"{platform.value}: {adapter.fatal_error_message}"
)
except Exception as e:
logger.error("%s error: %s", platform.value, e)
if connected_count == 0:
if startup_nonretryable_errors:
reason = "; ".join(startup_nonretryable_errors)
logger.error("Gateway hit a non-retryable startup conflict: %s", reason)
try:
from gateway.status import write_runtime_status
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
except Exception:
pass
self._request_clean_exit(reason)
return True
logger.warning("No messaging platforms connected.")
logger.info("Gateway will continue running for cron job execution.")
@@ -712,6 +797,11 @@ class GatewayRunner:
self.delivery_router.adapters = self.adapters
self._running = True
try:
from gateway.status import write_runtime_status
write_runtime_status(gateway_state="running", exit_reason=None)
except Exception:
pass
# Emit gateway:startup hook
hook_count = len(self.hooks.loaded_hooks)
@@ -806,8 +896,12 @@ class GatewayRunner:
self._shutdown_all_gateway_honcho()
self._shutdown_event.set()
from gateway.status import remove_pid_file
from gateway.status import remove_pid_file, write_runtime_status
remove_pid_file()
try:
write_runtime_status(gateway_state="stopped", exit_reason=self._exit_reason)
except Exception:
pass
logger.info("Gateway stopped")
@@ -3155,9 +3249,14 @@ class GatewayRunner:
if not git_dir.exists():
return "✗ Not a git repository — cannot update."
hermes_bin = shutil.which("hermes")
if not hermes_bin:
return "✗ `hermes` command not found on PATH."
hermes_cmd = _resolve_hermes_bin()
if not hermes_cmd:
return (
"✗ Could not locate the `hermes` command. "
"Hermes is running, but the update command could not find the "
"executable on PATH or via the current Python interpreter. "
"Try running `hermes update` manually in your terminal."
)
pending_path = _hermes_home / ".update_pending.json"
output_path = _hermes_home / ".update_output.txt"
@@ -3173,8 +3272,9 @@ class GatewayRunner:
# Spawn `hermes update` in a separate cgroup so it survives gateway
# restart. systemd-run --user --scope creates a transient scope unit.
hermes_cmd_str = " ".join(shlex.quote(part) for part in hermes_cmd)
update_cmd = (
f"{shlex.quote(hermes_bin)} update > {shlex.quote(str(output_path))} 2>&1; "
f"{hermes_cmd_str} update > {shlex.quote(str(output_path))} 2>&1; "
f"status=$?; printf '%s' \"$status\" > {shlex.quote(str(exit_code_path))}"
)
try:
@@ -4338,6 +4438,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
success = await runner.start()
if not success:
return False
if runner.should_exit_cleanly:
if runner.exit_reason:
logger.error("Gateway exiting cleanly: %s", runner.exit_reason)
return True
# Write PID file so CLI can detect gateway is running
import atexit

View File

@@ -11,13 +11,17 @@ that will be useful when we add named profiles (multiple agents running
concurrently under distinct configurations).
"""
import hashlib
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from typing import Any, Optional
_GATEWAY_KIND = "hermes-gateway"
_RUNTIME_STATUS_FILE = "gateway_state.json"
_LOCKS_DIRNAME = "gateway-locks"
def _get_pid_path() -> Path:
@@ -26,6 +30,32 @@ def _get_pid_path() -> Path:
return home / "gateway.pid"
def _get_runtime_status_path() -> Path:
    """Return the runtime status file path, a sibling of the gateway PID file."""
    pid_path = _get_pid_path()
    return pid_path.with_name(_RUNTIME_STATUS_FILE)
def _get_lock_dir() -> Path:
"""Return the machine-local directory for token-scoped gateway locks."""
override = os.getenv("HERMES_GATEWAY_LOCK_DIR")
if override:
return Path(override)
state_home = Path(os.getenv("XDG_STATE_HOME", Path.home() / ".local" / "state"))
return state_home / "hermes" / _LOCKS_DIRNAME
def _utc_now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _scope_hash(identity: str) -> str:
return hashlib.sha256(identity.encode("utf-8")).hexdigest()[:16]
def _get_scope_lock_path(scope: str, identity: str) -> Path:
    """Return the lock file path for *scope* and *identity*.

    The file is named ``<scope>-<identity hash>.lock`` inside the lock
    directory, so the raw identity never appears in the file name.
    """
    lock_dir = _get_lock_dir()
    identity_digest = _scope_hash(identity)
    return lock_dir / f"{scope}-{identity_digest}.lock"
def _get_process_start_time(pid: int) -> Optional[int]:
"""Return the kernel start time for a process when available."""
stat_path = Path(f"/proc/{pid}/stat")
@@ -73,6 +103,38 @@ def _build_pid_record() -> dict:
}
def _build_runtime_status_record() -> dict[str, Any]:
    """Build the initial runtime status record for this process.

    Starts from the PID record and adds the status fields with their
    initial values: state "starting", no exit reason, no platforms.
    """
    return {
        **_build_pid_record(),
        "gateway_state": "starting",
        "exit_reason": None,
        "platforms": {},
        "updated_at": _utc_now_iso(),
    }
def _read_json_file(path: Path) -> Optional[dict[str, Any]]:
if not path.exists():
return None
try:
raw = path.read_text().strip()
except OSError:
return None
if not raw:
return None
try:
payload = json.loads(raw)
except json.JSONDecodeError:
return None
return payload if isinstance(payload, dict) else None
def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload))
def _read_pid_record() -> Optional[dict]:
pid_path = _get_pid_path()
if not pid_path.exists():
@@ -99,9 +161,49 @@ def _read_pid_record() -> Optional[dict]:
def write_pid_file() -> None:
    """Write the current process PID and metadata to the gateway PID file."""
    # Single write through the shared JSON helper; the earlier direct
    # mkdir + write_text sequence duplicated exactly this work.
    _write_json_file(_get_pid_path(), _build_pid_record())
# Sentinel meaning "argument not supplied", so that an explicit ``None`` can
# be used to clear a previously persisted value.
_RUNTIME_STATUS_UNSET: Any = object()


def write_runtime_status(
    *,
    gateway_state: Optional[str] = None,
    exit_reason: Optional[str] = _RUNTIME_STATUS_UNSET,
    platform: Optional[str] = None,
    platform_state: Optional[str] = None,
    error_code: Optional[str] = _RUNTIME_STATUS_UNSET,
    error_message: Optional[str] = _RUNTIME_STATUS_UNSET,
) -> None:
    """Persist gateway runtime health information for diagnostics/status.

    The existing status file is read, updated with the supplied fields, and
    written back; a fresh record is built when no valid file exists.

    Args:
        gateway_state: New gateway state; ``None`` leaves it unchanged.
        exit_reason: New exit reason. Omit to leave unchanged; pass ``None``
            explicitly to clear a previously stored reason.
        platform: Platform whose entry is updated; ``None`` skips the
            per-platform update entirely.
        platform_state: New platform state; ``None`` leaves it unchanged.
        error_code: New platform error code. Omit to leave unchanged; pass
            ``None`` explicitly to clear it.
        error_message: New platform error message. Omit to leave unchanged;
            pass ``None`` explicitly to clear it.
    """
    path = _get_runtime_status_path()
    payload = _read_json_file(path) or _build_runtime_status_record()
    # Tolerate a hand-edited or corrupted file whose "platforms" entry is
    # missing or not an object.
    if not isinstance(payload.get("platforms"), dict):
        payload["platforms"] = {}
    # NOTE(review): setdefault keeps pid/start_time from an existing file,
    # which may belong to a previous process — confirm this is intended.
    payload.setdefault("kind", _GATEWAY_KIND)
    payload.setdefault("pid", os.getpid())
    payload.setdefault("start_time", _get_process_start_time(os.getpid()))
    payload["updated_at"] = _utc_now_iso()

    if gateway_state is not None:
        payload["gateway_state"] = gateway_state
    # Callers pass ``exit_reason=None`` / ``error_code=None`` to reset stale
    # values (e.g. after a successful reconnect); only an omitted argument
    # means "keep what is stored".
    if exit_reason is not _RUNTIME_STATUS_UNSET:
        payload["exit_reason"] = exit_reason

    if platform is not None:
        platform_payload = payload["platforms"].get(platform)
        if not isinstance(platform_payload, dict):
            platform_payload = {}
        if platform_state is not None:
            platform_payload["state"] = platform_state
        if error_code is not _RUNTIME_STATUS_UNSET:
            platform_payload["error_code"] = error_code
        if error_message is not _RUNTIME_STATUS_UNSET:
            platform_payload["error_message"] = error_message
        platform_payload["updated_at"] = _utc_now_iso()
        payload["platforms"][platform] = platform_payload

    _write_json_file(path, payload)
def read_runtime_status() -> Optional[dict[str, Any]]:
    """Load the persisted gateway runtime status.

    Returns:
        The stored status record, or ``None`` when the status file cannot be
        read as a JSON object.
    """
    status_path = _get_runtime_status_path()
    return _read_json_file(status_path)
def remove_pid_file() -> None:
@@ -112,6 +214,87 @@ def remove_pid_file() -> None:
pass
def acquire_scoped_lock(
    scope: str,
    identity: str,
    metadata: Optional[dict[str, Any]] = None,
) -> tuple[bool, Optional[dict[str, Any]]]:
    """Take the machine-local lock identified by *scope* and *identity*.

    The lock stops several local gateways from using one external identity
    at the same time (for example the same Telegram bot token from
    different HERMES_HOME directories).

    Args:
        scope: Lock namespace, used as the lock file name prefix.
        identity: Identity being locked; only its hash is stored.
        metadata: Optional extra data saved in the lock record.

    Returns:
        ``(acquired, record)``:
          * ``(True, None)`` when a new lock file was created;
          * ``(True, existing)`` when this process already held the lock
            (the record is refreshed);
          * ``(False, existing)`` when a live process holds the lock;
          * ``(False, record_or_None)`` when another process created the
            lock file first.
    """
    lock_file = _get_scope_lock_path(scope, identity)
    lock_file.parent.mkdir(parents=True, exist_ok=True)

    new_record = dict(_build_pid_record())
    new_record.update(
        scope=scope,
        identity_hash=_scope_hash(identity),
        metadata=metadata or {},
        updated_at=_utc_now_iso(),
    )

    holder = _read_json_file(lock_file)
    if holder:
        try:
            holder_pid = int(holder["pid"])
        except (KeyError, TypeError, ValueError):
            holder_pid = None
        holder_start = holder.get("start_time")

        # Re-entrant acquire by the very same process: refresh and succeed.
        if holder_pid == os.getpid() and holder_start == new_record.get("start_time"):
            _write_json_file(lock_file, new_record)
            return True, holder

        # A record without a usable PID cannot belong to a live holder.
        is_stale = holder_pid is None
        if not is_stale:
            try:
                os.kill(holder_pid, 0)  # signal 0 only probes the process
            except (ProcessLookupError, PermissionError):
                is_stale = True
            else:
                # The PID is alive; a differing start time means the PID
                # now belongs to a different process than the lock holder.
                live_start = _get_process_start_time(holder_pid)
                is_stale = (
                    holder_start is not None
                    and live_start is not None
                    and live_start != holder_start
                )

        if not is_stale:
            return False, holder
        try:
            lock_file.unlink(missing_ok=True)
        except OSError:
            pass

    # O_EXCL makes creation atomic: only one contender can create the file.
    try:
        fd = os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except FileExistsError:
        return False, _read_json_file(lock_file)

    try:
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            json.dump(new_record, handle)
    except Exception:
        # Do not leave a lock file behind that we failed to populate.
        try:
            lock_file.unlink(missing_ok=True)
        except OSError:
            pass
        raise
    return True, None
def release_scoped_lock(scope: str, identity: str) -> None:
    """Delete the scope lock file, but only when this process owns it.

    Ownership means the stored PID and start time both match the current
    process. A missing or unreadable lock record, or one owned by another
    process, is left alone. Errors while deleting are ignored.

    Args:
        scope: Lock namespace used when the lock was acquired.
        identity: Identity used when the lock was acquired.
    """
    lock_file = _get_scope_lock_path(scope, identity)
    holder = _read_json_file(lock_file)
    if not holder:
        return

    my_pid = os.getpid()
    owned_by_us = (
        holder.get("pid") == my_pid
        and holder.get("start_time") == _get_process_start_time(my_pid)
    )
    if not owned_by_us:
        return

    try:
        lock_file.unlink(missing_ok=True)
    except OSError:
        pass
def get_running_pid() -> Optional[int]:
"""Return the PID of a running gateway instance, or ``None``.