- detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever
- add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once
- persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration
Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled)
Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago
Invocation: 8879379b25994201b98381f4bd80c2af
Main PID: 1147926 (python)
Tasks: 16 (limit: 76757)
Memory: 151.4M (peak: 168.1M)
CPU: 47.883s
CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service
├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace
└─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat
Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)...
Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data]
Mar 14 09:27:04 teknium-dev python[1147926]: Content: ''
Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded.
Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data]
Mar 14 09:27:07 teknium-dev python[1147926]: Content: ''
Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)...
Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data]
Mar 14 09:27:12 teknium-dev python[1147926]: Content: ''
Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)...
⚠ Installed gateway service definition is outdated
Run: hermes gateway restart # auto-refreshes the unit
✓ Gateway service is running
✓ Systemd linger is enabled (service survives logout)
- cleanly exit non-retryable startup conflicts without triggering service restart loops
Tests:
- gateway status runtime-state helpers
- Telegram token-lock and polling-conflict behavior
- GatewayRunner clean exit on non-retryable startup conflict
- CLI runtime health summary
337 lines
10 KiB
Python
337 lines
10 KiB
Python
"""
|
|
Gateway runtime status helpers.
|
|
|
|
Provides PID-file based detection of whether the gateway daemon is running,
|
|
used by send_message's check_fn to gate availability in the CLI.
|
|
|
|
The PID file lives at ``{HERMES_HOME}/gateway.pid``. HERMES_HOME defaults to
|
|
``~/.hermes`` but can be overridden via the environment variable. This means
|
|
separate HERMES_HOME directories naturally get separate PID files — a property
|
|
that will be useful when we add named profiles (multiple agents running
|
|
concurrently under distinct configurations).
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
_GATEWAY_KIND = "hermes-gateway"
|
|
_RUNTIME_STATUS_FILE = "gateway_state.json"
|
|
_LOCKS_DIRNAME = "gateway-locks"
|
|
|
|
|
|
def _get_pid_path() -> Path:
|
|
"""Return the path to the gateway PID file, respecting HERMES_HOME."""
|
|
home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
|
|
return home / "gateway.pid"
|
|
|
|
|
|
def _get_runtime_status_path() -> Path:
|
|
"""Return the persisted runtime health/status file path."""
|
|
return _get_pid_path().with_name(_RUNTIME_STATUS_FILE)
|
|
|
|
|
|
def _get_lock_dir() -> Path:
|
|
"""Return the machine-local directory for token-scoped gateway locks."""
|
|
override = os.getenv("HERMES_GATEWAY_LOCK_DIR")
|
|
if override:
|
|
return Path(override)
|
|
state_home = Path(os.getenv("XDG_STATE_HOME", Path.home() / ".local" / "state"))
|
|
return state_home / "hermes" / _LOCKS_DIRNAME
|
|
|
|
|
|
def _utc_now_iso() -> str:
|
|
return datetime.now(timezone.utc).isoformat()
|
|
|
|
|
|
def _scope_hash(identity: str) -> str:
|
|
return hashlib.sha256(identity.encode("utf-8")).hexdigest()[:16]
|
|
|
|
|
|
def _get_scope_lock_path(scope: str, identity: str) -> Path:
|
|
return _get_lock_dir() / f"{scope}-{_scope_hash(identity)}.lock"
|
|
|
|
|
|
def _get_process_start_time(pid: int) -> Optional[int]:
|
|
"""Return the kernel start time for a process when available."""
|
|
stat_path = Path(f"/proc/{pid}/stat")
|
|
try:
|
|
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
|
|
return int(stat_path.read_text().split()[21])
|
|
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
|
|
return None
|
|
|
|
|
|
def _read_process_cmdline(pid: int) -> Optional[str]:
|
|
"""Return the process command line as a space-separated string."""
|
|
cmdline_path = Path(f"/proc/{pid}/cmdline")
|
|
try:
|
|
raw = cmdline_path.read_bytes()
|
|
except (FileNotFoundError, PermissionError, OSError):
|
|
return None
|
|
|
|
if not raw:
|
|
return None
|
|
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
|
|
|
|
|
|
def _looks_like_gateway_process(pid: int) -> bool:
|
|
"""Return True when the live PID still looks like the Hermes gateway."""
|
|
cmdline = _read_process_cmdline(pid)
|
|
if not cmdline:
|
|
# If we cannot inspect the process, fall back to the liveness check.
|
|
return True
|
|
|
|
patterns = (
|
|
"hermes_cli.main gateway",
|
|
"hermes gateway",
|
|
"gateway/run.py",
|
|
)
|
|
return any(pattern in cmdline for pattern in patterns)
|
|
|
|
|
|
def _build_pid_record() -> dict:
|
|
return {
|
|
"pid": os.getpid(),
|
|
"kind": _GATEWAY_KIND,
|
|
"argv": list(sys.argv),
|
|
"start_time": _get_process_start_time(os.getpid()),
|
|
}
|
|
|
|
|
|
def _build_runtime_status_record() -> dict[str, Any]:
|
|
payload = _build_pid_record()
|
|
payload.update({
|
|
"gateway_state": "starting",
|
|
"exit_reason": None,
|
|
"platforms": {},
|
|
"updated_at": _utc_now_iso(),
|
|
})
|
|
return payload
|
|
|
|
|
|
def _read_json_file(path: Path) -> Optional[dict[str, Any]]:
|
|
if not path.exists():
|
|
return None
|
|
try:
|
|
raw = path.read_text().strip()
|
|
except OSError:
|
|
return None
|
|
if not raw:
|
|
return None
|
|
try:
|
|
payload = json.loads(raw)
|
|
except json.JSONDecodeError:
|
|
return None
|
|
return payload if isinstance(payload, dict) else None
|
|
|
|
|
|
def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
path.write_text(json.dumps(payload))
|
|
|
|
|
|
def _read_pid_record() -> Optional[dict]:
|
|
pid_path = _get_pid_path()
|
|
if not pid_path.exists():
|
|
return None
|
|
|
|
raw = pid_path.read_text().strip()
|
|
if not raw:
|
|
return None
|
|
|
|
try:
|
|
payload = json.loads(raw)
|
|
except json.JSONDecodeError:
|
|
try:
|
|
return {"pid": int(raw)}
|
|
except ValueError:
|
|
return None
|
|
|
|
if isinstance(payload, int):
|
|
return {"pid": payload}
|
|
if isinstance(payload, dict):
|
|
return payload
|
|
return None
|
|
|
|
|
|
def write_pid_file() -> None:
|
|
"""Write the current process PID and metadata to the gateway PID file."""
|
|
_write_json_file(_get_pid_path(), _build_pid_record())
|
|
|
|
|
|
def write_runtime_status(
|
|
*,
|
|
gateway_state: Optional[str] = None,
|
|
exit_reason: Optional[str] = None,
|
|
platform: Optional[str] = None,
|
|
platform_state: Optional[str] = None,
|
|
error_code: Optional[str] = None,
|
|
error_message: Optional[str] = None,
|
|
) -> None:
|
|
"""Persist gateway runtime health information for diagnostics/status."""
|
|
path = _get_runtime_status_path()
|
|
payload = _read_json_file(path) or _build_runtime_status_record()
|
|
payload.setdefault("platforms", {})
|
|
payload.setdefault("kind", _GATEWAY_KIND)
|
|
payload.setdefault("pid", os.getpid())
|
|
payload.setdefault("start_time", _get_process_start_time(os.getpid()))
|
|
payload["updated_at"] = _utc_now_iso()
|
|
|
|
if gateway_state is not None:
|
|
payload["gateway_state"] = gateway_state
|
|
if exit_reason is not None:
|
|
payload["exit_reason"] = exit_reason
|
|
|
|
if platform is not None:
|
|
platform_payload = payload["platforms"].get(platform, {})
|
|
if platform_state is not None:
|
|
platform_payload["state"] = platform_state
|
|
if error_code is not None:
|
|
platform_payload["error_code"] = error_code
|
|
if error_message is not None:
|
|
platform_payload["error_message"] = error_message
|
|
platform_payload["updated_at"] = _utc_now_iso()
|
|
payload["platforms"][platform] = platform_payload
|
|
|
|
_write_json_file(path, payload)
|
|
|
|
|
|
def read_runtime_status() -> Optional[dict[str, Any]]:
|
|
"""Read the persisted gateway runtime health/status information."""
|
|
return _read_json_file(_get_runtime_status_path())
|
|
|
|
|
|
def remove_pid_file() -> None:
|
|
"""Remove the gateway PID file if it exists."""
|
|
try:
|
|
_get_pid_path().unlink(missing_ok=True)
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, Any]] = None) -> tuple[bool, Optional[dict[str, Any]]]:
|
|
"""Acquire a machine-local lock keyed by scope + identity.
|
|
|
|
Used to prevent multiple local gateways from using the same external identity
|
|
at once (e.g. the same Telegram bot token across different HERMES_HOME dirs).
|
|
"""
|
|
lock_path = _get_scope_lock_path(scope, identity)
|
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
|
record = {
|
|
**_build_pid_record(),
|
|
"scope": scope,
|
|
"identity_hash": _scope_hash(identity),
|
|
"metadata": metadata or {},
|
|
"updated_at": _utc_now_iso(),
|
|
}
|
|
|
|
existing = _read_json_file(lock_path)
|
|
if existing:
|
|
try:
|
|
existing_pid = int(existing["pid"])
|
|
except (KeyError, TypeError, ValueError):
|
|
existing_pid = None
|
|
|
|
if existing_pid == os.getpid() and existing.get("start_time") == record.get("start_time"):
|
|
_write_json_file(lock_path, record)
|
|
return True, existing
|
|
|
|
stale = existing_pid is None
|
|
if not stale:
|
|
try:
|
|
os.kill(existing_pid, 0)
|
|
except (ProcessLookupError, PermissionError):
|
|
stale = True
|
|
else:
|
|
current_start = _get_process_start_time(existing_pid)
|
|
if (
|
|
existing.get("start_time") is not None
|
|
and current_start is not None
|
|
and current_start != existing.get("start_time")
|
|
):
|
|
stale = True
|
|
if stale:
|
|
try:
|
|
lock_path.unlink(missing_ok=True)
|
|
except OSError:
|
|
pass
|
|
else:
|
|
return False, existing
|
|
|
|
try:
|
|
fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
|
|
except FileExistsError:
|
|
return False, _read_json_file(lock_path)
|
|
try:
|
|
with os.fdopen(fd, "w", encoding="utf-8") as handle:
|
|
json.dump(record, handle)
|
|
except Exception:
|
|
try:
|
|
lock_path.unlink(missing_ok=True)
|
|
except OSError:
|
|
pass
|
|
raise
|
|
return True, None
|
|
|
|
|
|
def release_scoped_lock(scope: str, identity: str) -> None:
|
|
"""Release a previously-acquired scope lock when owned by this process."""
|
|
lock_path = _get_scope_lock_path(scope, identity)
|
|
existing = _read_json_file(lock_path)
|
|
if not existing:
|
|
return
|
|
if existing.get("pid") != os.getpid():
|
|
return
|
|
if existing.get("start_time") != _get_process_start_time(os.getpid()):
|
|
return
|
|
try:
|
|
lock_path.unlink(missing_ok=True)
|
|
except OSError:
|
|
pass
|
|
|
|
|
|
def get_running_pid() -> Optional[int]:
|
|
"""Return the PID of a running gateway instance, or ``None``.
|
|
|
|
Checks the PID file and verifies the process is actually alive.
|
|
Cleans up stale PID files automatically.
|
|
"""
|
|
record = _read_pid_record()
|
|
if not record:
|
|
remove_pid_file()
|
|
return None
|
|
|
|
try:
|
|
pid = int(record["pid"])
|
|
except (KeyError, TypeError, ValueError):
|
|
remove_pid_file()
|
|
return None
|
|
|
|
try:
|
|
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
|
except (ProcessLookupError, PermissionError):
|
|
remove_pid_file()
|
|
return None
|
|
|
|
recorded_start = record.get("start_time")
|
|
current_start = _get_process_start_time(pid)
|
|
if recorded_start is not None and current_start is not None and current_start != recorded_start:
|
|
remove_pid_file()
|
|
return None
|
|
|
|
if not _looks_like_gateway_process(pid):
|
|
remove_pid_file()
|
|
return None
|
|
|
|
return pid
|
|
|
|
|
|
def is_gateway_running() -> bool:
|
|
"""Check if the gateway daemon is currently running."""
|
|
return get_running_pid() is not None
|