Files
hermes-agent/gateway/status.py

392 lines
12 KiB
Python
Raw Permalink Normal View History

"""
Gateway runtime status helpers.
Provides PID-file based detection of whether the gateway daemon is running,
used by send_message's check_fn to gate availability in the CLI.
The PID file lives at ``{HERMES_HOME}/gateway.pid``. HERMES_HOME defaults to
``~/.hermes`` but can be overridden via the environment variable. This means
separate HERMES_HOME directories naturally get separate PID files a property
that will be useful when we add named profiles (multiple agents running
concurrently under distinct configurations).
"""
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
import hashlib
import json
import os
import sys
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
from datetime import datetime, timezone
from pathlib import Path
from hermes_constants import get_hermes_home
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
from typing import Any, Optional
_GATEWAY_KIND = "hermes-gateway"
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
_RUNTIME_STATUS_FILE = "gateway_state.json"
_LOCKS_DIRNAME = "gateway-locks"
def _get_pid_path() -> Path:
"""Return the path to the gateway PID file, respecting HERMES_HOME."""
home = get_hermes_home()
return home / "gateway.pid"
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
def _get_runtime_status_path() -> Path:
"""Return the persisted runtime health/status file path."""
return _get_pid_path().with_name(_RUNTIME_STATUS_FILE)
def _get_lock_dir() -> Path:
"""Return the machine-local directory for token-scoped gateway locks."""
override = os.getenv("HERMES_GATEWAY_LOCK_DIR")
if override:
return Path(override)
state_home = Path(os.getenv("XDG_STATE_HOME", Path.home() / ".local" / "state"))
return state_home / "hermes" / _LOCKS_DIRNAME
def _utc_now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _scope_hash(identity: str) -> str:
return hashlib.sha256(identity.encode("utf-8")).hexdigest()[:16]
def _get_scope_lock_path(scope: str, identity: str) -> Path:
return _get_lock_dir() / f"{scope}-{_scope_hash(identity)}.lock"
def _get_process_start_time(pid: int) -> Optional[int]:
"""Return the kernel start time for a process when available."""
stat_path = Path(f"/proc/{pid}/stat")
try:
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
return int(stat_path.read_text().split()[21])
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
return None
def _read_process_cmdline(pid: int) -> Optional[str]:
"""Return the process command line as a space-separated string."""
cmdline_path = Path(f"/proc/{pid}/cmdline")
try:
raw = cmdline_path.read_bytes()
except (FileNotFoundError, PermissionError, OSError):
return None
if not raw:
return None
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
def _looks_like_gateway_process(pid: int) -> bool:
"""Return True when the live PID still looks like the Hermes gateway."""
cmdline = _read_process_cmdline(pid)
if not cmdline:
return False
patterns = (
"hermes_cli.main gateway",
"hermes_cli/main.py gateway",
"hermes gateway",
"gateway/run.py",
)
return any(pattern in cmdline for pattern in patterns)
def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
"""Validate gateway identity from PID-file metadata when cmdline is unavailable."""
if record.get("kind") != _GATEWAY_KIND:
return False
argv = record.get("argv")
if not isinstance(argv, list) or not argv:
return False
cmdline = " ".join(str(part) for part in argv)
patterns = (
"hermes_cli.main gateway",
"hermes_cli/main.py gateway",
"hermes gateway",
"gateway/run.py",
)
return any(pattern in cmdline for pattern in patterns)
def _build_pid_record() -> dict:
return {
"pid": os.getpid(),
"kind": _GATEWAY_KIND,
"argv": list(sys.argv),
"start_time": _get_process_start_time(os.getpid()),
}
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
def _build_runtime_status_record() -> dict[str, Any]:
payload = _build_pid_record()
payload.update({
"gateway_state": "starting",
"exit_reason": None,
"platforms": {},
"updated_at": _utc_now_iso(),
})
return payload
def _read_json_file(path: Path) -> Optional[dict[str, Any]]:
if not path.exists():
return None
try:
raw = path.read_text().strip()
except OSError:
return None
if not raw:
return None
try:
payload = json.loads(raw)
except json.JSONDecodeError:
return None
return payload if isinstance(payload, dict) else None
def _write_json_file(path: Path, payload: dict[str, Any]) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload))
def _read_pid_record() -> Optional[dict]:
pid_path = _get_pid_path()
if not pid_path.exists():
return None
raw = pid_path.read_text().strip()
if not raw:
return None
try:
payload = json.loads(raw)
except json.JSONDecodeError:
try:
return {"pid": int(raw)}
except ValueError:
return None
if isinstance(payload, int):
return {"pid": payload}
if isinstance(payload, dict):
return payload
return None
def write_pid_file() -> None:
"""Write the current process PID and metadata to the gateway PID file."""
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
_write_json_file(_get_pid_path(), _build_pid_record())
def write_runtime_status(
*,
gateway_state: Optional[str] = None,
exit_reason: Optional[str] = None,
platform: Optional[str] = None,
platform_state: Optional[str] = None,
error_code: Optional[str] = None,
error_message: Optional[str] = None,
) -> None:
"""Persist gateway runtime health information for diagnostics/status."""
path = _get_runtime_status_path()
payload = _read_json_file(path) or _build_runtime_status_record()
payload.setdefault("platforms", {})
payload.setdefault("kind", _GATEWAY_KIND)
payload["pid"] = os.getpid()
payload["start_time"] = _get_process_start_time(os.getpid())
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
payload["updated_at"] = _utc_now_iso()
if gateway_state is not None:
payload["gateway_state"] = gateway_state
if exit_reason is not None:
payload["exit_reason"] = exit_reason
if platform is not None:
platform_payload = payload["platforms"].get(platform, {})
if platform_state is not None:
platform_payload["state"] = platform_state
if error_code is not None:
platform_payload["error_code"] = error_code
if error_message is not None:
platform_payload["error_message"] = error_message
platform_payload["updated_at"] = _utc_now_iso()
payload["platforms"][platform] = platform_payload
_write_json_file(path, payload)
def read_runtime_status() -> Optional[dict[str, Any]]:
"""Read the persisted gateway runtime health/status information."""
return _read_json_file(_get_runtime_status_path())
def remove_pid_file() -> None:
"""Remove the gateway PID file if it exists."""
try:
_get_pid_path().unlink(missing_ok=True)
except Exception:
pass
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, Any]] = None) -> tuple[bool, Optional[dict[str, Any]]]:
"""Acquire a machine-local lock keyed by scope + identity.
Used to prevent multiple local gateways from using the same external identity
at once (e.g. the same Telegram bot token across different HERMES_HOME dirs).
"""
lock_path = _get_scope_lock_path(scope, identity)
lock_path.parent.mkdir(parents=True, exist_ok=True)
record = {
**_build_pid_record(),
"scope": scope,
"identity_hash": _scope_hash(identity),
"metadata": metadata or {},
"updated_at": _utc_now_iso(),
}
existing = _read_json_file(lock_path)
if existing:
try:
existing_pid = int(existing["pid"])
except (KeyError, TypeError, ValueError):
existing_pid = None
if existing_pid == os.getpid() and existing.get("start_time") == record.get("start_time"):
_write_json_file(lock_path, record)
return True, existing
stale = existing_pid is None
if not stale:
try:
os.kill(existing_pid, 0)
except (ProcessLookupError, PermissionError):
stale = True
else:
current_start = _get_process_start_time(existing_pid)
if (
existing.get("start_time") is not None
and current_start is not None
and current_start != existing.get("start_time")
):
stale = True
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
# processes still respond to os.kill(pid, 0) but are not
# actually running. Treat them as stale so --replace works.
if not stale:
try:
_proc_status = Path(f"/proc/{existing_pid}/status")
if _proc_status.exists():
for _line in _proc_status.read_text().splitlines():
if _line.startswith("State:"):
_state = _line.split()[1]
if _state in ("T", "t"): # stopped or tracing stop
stale = True
break
except (OSError, PermissionError):
pass
fix(gateway): harden Telegram polling conflict handling - detect Telegram getUpdates conflicts and stop polling cleanly instead of retry-spamming forever - add a machine-local token-scoped lock so different HERMES_HOME profiles on the same host can't poll the same bot token at once - persist gateway runtime health/fatal adapter state and surface it in ● hermes-gateway.service - Hermes Agent Gateway - Messaging Platform Integration Loaded: loaded (/home/teknium/.config/systemd/user/hermes-gateway.service; enabled; preset: enabled) Active: active (running) since Sat 2026-03-14 09:25:35 PDT; 2h 45min ago Invocation: 8879379b25994201b98381f4bd80c2af Main PID: 1147926 (python) Tasks: 16 (limit: 76757) Memory: 151.4M (peak: 168.1M) CPU: 47.883s CGroup: /user.slice/user-1000.slice/user@1000.service/app.slice/hermes-gateway.service ├─1147926 /home/teknium/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run --replace └─1147966 node /home/teknium/.hermes/hermes-agent/scripts/whatsapp-bridge/bridge.js --port 3000 --session /home/teknium/.hermes/whatsapp/session --mode self-chat Mar 14 09:27:03 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... Mar 14 09:27:04 teknium-dev python[1147926]: [409B blob data] Mar 14 09:27:04 teknium-dev python[1147926]: Content: '' Mar 14 09:27:04 teknium-dev python[1147926]: ❌ Max retries (3) for empty content exceeded. Mar 14 09:27:07 teknium-dev python[1147926]: [1K blob data] Mar 14 09:27:07 teknium-dev python[1147926]: Content: '' Mar 14 09:27:07 teknium-dev python[1147926]: 🔄 Retrying API call (1/3)... Mar 14 09:27:12 teknium-dev python[1147926]: [1.7K blob data] Mar 14 09:27:12 teknium-dev python[1147926]: Content: '' Mar 14 09:27:12 teknium-dev python[1147926]: 🔄 Retrying API call (2/3)... ⚠ Installed gateway service definition is outdated Run: hermes gateway restart # auto-refreshes the unit ✓ Gateway service is running ✓ Systemd linger is enabled (service survives logout) - cleanly exit non-retryable startup conflicts without triggering service restart loops Tests: - gateway status runtime-state helpers - Telegram token-lock and polling-conflict behavior - GatewayRunner clean exit on non-retryable startup conflict - CLI runtime health summary
2026-03-14 12:11:23 -07:00
if stale:
try:
lock_path.unlink(missing_ok=True)
except OSError:
pass
else:
return False, existing
try:
fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
except FileExistsError:
return False, _read_json_file(lock_path)
try:
with os.fdopen(fd, "w", encoding="utf-8") as handle:
json.dump(record, handle)
except Exception:
try:
lock_path.unlink(missing_ok=True)
except OSError:
pass
raise
return True, None
def release_scoped_lock(scope: str, identity: str) -> None:
"""Release a previously-acquired scope lock when owned by this process."""
lock_path = _get_scope_lock_path(scope, identity)
existing = _read_json_file(lock_path)
if not existing:
return
if existing.get("pid") != os.getpid():
return
if existing.get("start_time") != _get_process_start_time(os.getpid()):
return
try:
lock_path.unlink(missing_ok=True)
except OSError:
pass
def release_all_scoped_locks() -> int:
"""Remove all scoped lock files in the lock directory.
Called during --replace to clean up stale locks left by stopped/killed
gateway processes that did not release their locks gracefully.
Returns the number of lock files removed.
"""
lock_dir = _get_lock_dir()
removed = 0
if lock_dir.exists():
for lock_file in lock_dir.glob("*.lock"):
try:
lock_file.unlink(missing_ok=True)
removed += 1
except OSError:
pass
return removed
def get_running_pid() -> Optional[int]:
"""Return the PID of a running gateway instance, or ``None``.
Checks the PID file and verifies the process is actually alive.
Cleans up stale PID files automatically.
"""
record = _read_pid_record()
if not record:
remove_pid_file()
return None
try:
pid = int(record["pid"])
except (KeyError, TypeError, ValueError):
remove_pid_file()
return None
try:
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
except (ProcessLookupError, PermissionError):
remove_pid_file()
return None
recorded_start = record.get("start_time")
current_start = _get_process_start_time(pid)
if recorded_start is not None and current_start is not None and current_start != recorded_start:
remove_pid_file()
return None
if not _looks_like_gateway_process(pid):
if not _record_looks_like_gateway(record):
remove_pid_file()
return None
return pid
def is_gateway_running() -> bool:
"""Check if the gateway daemon is currently running."""
return get_running_pid() is not None