diff --git a/DEPLOY.md b/DEPLOY.md new file mode 100644 index 000000000..bce4cf0e3 --- /dev/null +++ b/DEPLOY.md @@ -0,0 +1,569 @@ +# Hermes Agent — Sovereign Deployment Runbook + +> **Goal**: A new VPS can go from bare OS to a running Hermes instance in under 30 minutes using only this document. + +--- + +## Table of Contents + +1. [Prerequisites](#1-prerequisites) +2. [Environment Setup](#2-environment-setup) +3. [Secret Injection](#3-secret-injection) +4. [Installation](#4-installation) +5. [Starting the Stack](#5-starting-the-stack) +6. [Health Checks](#6-health-checks) +7. [Stop / Restart Procedures](#7-stop--restart-procedures) +8. [Zero-Downtime Restart](#8-zero-downtime-restart) +9. [Rollback Procedure](#9-rollback-procedure) +10. [Database / State Migrations](#10-database--state-migrations) +11. [Docker Compose Deployment](#11-docker-compose-deployment) +12. [systemd Deployment](#12-systemd-deployment) +13. [Monitoring & Logs](#13-monitoring--logs) +14. [Security Checklist](#14-security-checklist) +15. [Troubleshooting](#15-troubleshooting) + +--- + +## 1. Prerequisites + +| Requirement | Minimum | Recommended | +|-------------|---------|-------------| +| OS | Ubuntu 22.04 LTS | Ubuntu 24.04 LTS | +| RAM | 512 MB | 2 GB | +| CPU | 1 vCPU | 2 vCPU | +| Disk | 5 GB | 20 GB | +| Python | 3.11 | 3.12 | +| Node.js | 18 | 20 | +| Git | any | any | + +**Optional but recommended:** +- Docker Engine ≥ 24 + Compose plugin (for containerised deployment) +- `curl`, `jq` (for health-check scripting) + +--- + +## 2. Environment Setup + +### 2a. Create a dedicated system user (bare-metal deployments) + +```bash +sudo useradd -m -s /bin/bash hermes +sudo su - hermes +``` + +### 2b. 
Install Hermes + +```bash +# Official one-liner installer +curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash + +# Reload PATH so `hermes` is available +source ~/.bashrc +``` + +The installer places: +- The agent code at `~/.local/lib/python3.x/site-packages/` (pip editable install) +- The `hermes` entry point at `~/.local/bin/hermes` +- Default config directory at `~/.hermes/` + +### 2c. Verify installation + +```bash +hermes --version +hermes doctor +``` + +--- + +## 3. Secret Injection + +**Rule: secrets never live in the repository. They live only in `~/.hermes/.env`.** + +```bash +# Copy the template (do NOT edit the repo copy) +cp /path/to/hermes-agent/.env.example ~/.hermes/.env +chmod 600 ~/.hermes/.env + +# Edit with your preferred editor +nano ~/.hermes/.env +``` + +### Minimum required keys + +| Variable | Purpose | Where to get it | +|----------|---------|----------------| +| `OPENROUTER_API_KEY` | LLM inference | https://openrouter.ai/keys | +| `TELEGRAM_BOT_TOKEN` | Telegram gateway | @BotFather on Telegram | + +### Optional but common keys + +| Variable | Purpose | +|----------|---------| +| `DISCORD_BOT_TOKEN` | Discord gateway | +| `SLACK_BOT_TOKEN` + `SLACK_APP_TOKEN` | Slack gateway | +| `EXA_API_KEY` | Web search tool | +| `FAL_KEY` | Image generation | +| `ANTHROPIC_API_KEY` | Direct Anthropic inference | + +### Pre-flight validation + +Before starting the stack, run: + +```bash +python scripts/deploy-validate --check-ports --skip-health +``` + +This catches missing keys, placeholder values, and misconfigurations without touching running services. + +--- + +## 4. Installation + +### 4a. Clone the repository (if not using the installer) + +```bash +git clone https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent.git +cd hermes-agent +pip install -e ".[all]" --user +npm install +``` + +### 4b. 
Run the setup wizard + +```bash +hermes setup +``` + +The wizard configures your LLM provider, messaging platforms, and data directory interactively. + +--- + +## 5. Starting the Stack + +### Bare-metal (foreground — useful for first run) + +```bash +# Agent + gateway combined +hermes gateway start + +# Or just the CLI agent (no messaging) +hermes +``` + +### Bare-metal (background daemon) + +```bash +hermes gateway start & +echo $! > ~/.hermes/gateway.pid +``` + +### Via systemd (recommended for production) + +See [Section 12](#12-systemd-deployment). + +### Via Docker Compose + +See [Section 11](#11-docker-compose-deployment). + +--- + +## 6. Health Checks + +### 6a. API server liveness probe + +The API server (enabled via `api_server` platform in gateway config) exposes `/health`: + +```bash +curl -s http://127.0.0.1:8642/health | jq . +``` + +Expected response: + +```json +{ + "status": "ok", + "platform": "hermes-agent", + "version": "0.5.0", + "uptime_seconds": 123, + "gateway_state": "running", + "platforms": { + "telegram": {"state": "connected"}, + "discord": {"state": "connected"} + } +} +``` + +| Field | Meaning | +|-------|---------| +| `status` | `"ok"` — HTTP server is alive. Any non-200 = down. | +| `gateway_state` | `"running"` — all platforms started. `"starting"` — still initialising. | +| `platforms` | Per-adapter connection state. | + +### 6b. Gateway runtime status file + +```bash +cat ~/.hermes/gateway_state.json | jq '{state: .gateway_state, platforms: .platforms}' +``` + +### 6c. Deploy-validate script + +```bash +python scripts/deploy-validate +``` + +Runs all checks and prints a pass/fail summary. Exit code 0 = healthy. + +### 6d. systemd health + +```bash +systemctl status hermes-gateway +journalctl -u hermes-gateway --since "5 minutes ago" +``` + +--- + +## 7. 
Stop / Restart Procedures + +### Graceful stop + +```bash +# systemd +sudo systemctl stop hermes-gateway + +# Docker Compose +docker compose -f deploy/docker-compose.yml down + +# Process signal (if running ad-hoc) +kill -TERM $(cat ~/.hermes/gateway.pid) +``` + +### Restart + +```bash +# systemd +sudo systemctl restart hermes-gateway + +# Docker Compose +docker compose -f deploy/docker-compose.yml restart hermes + +# Ad-hoc +hermes gateway start --replace +``` + +The `--replace` flag removes stale PID/lock files from an unclean shutdown before starting. + +--- + +## 8. Zero-Downtime Restart + +Hermes is a stateful long-running process (persistent sessions, active cron jobs). True zero-downtime requires careful sequencing. + +### Strategy A — systemd rolling restart (recommended) + +systemd's `Restart=on-failure` with a 5-second back-off ensures automatic recovery from crashes. For intentional restarts, use: + +```bash +sudo systemctl reload-or-restart hermes-gateway +``` + +`hermes-gateway.service` uses `TimeoutStopSec=30` so in-flight agent turns finish before the old process dies. + +> **Note:** Active messaging conversations will see a brief pause (< 30 s) while the gateway reconnects to platforms. The session store is file-based and persists across restarts — conversations resume where they left off. + +### Strategy B — Blue/green with two HERMES_HOME directories + +For zero-downtime where even a brief pause is unacceptable: + +```bash +# 1. Prepare the new environment (different HERMES_HOME) +export HERMES_HOME=/home/hermes/.hermes-green +hermes setup # configure green env with same .env + +# 2. Start green on a different port (e.g. 8643) +API_SERVER_PORT=8643 hermes gateway start & + +# 3. Verify green is healthy +curl -s http://127.0.0.1:8643/health | jq .gateway_state + +# 4. Switch load balancer (nginx/caddy) to port 8643 + +# 5. 
Gracefully stop blue +kill -TERM $(cat ~/.hermes/gateway.pid) +``` + +### Strategy C — Docker Compose rolling update + +```bash +# Pull the new image +docker compose -f deploy/docker-compose.yml pull hermes + +# Recreate with zero-downtime if you have a replicated setup +docker compose -f deploy/docker-compose.yml up -d --no-deps hermes +``` + +With the single-container stack in `deploy/docker-compose.yml`, Compose stops the old container before starting its replacement, so expect a brief interruption; the healthcheck only determines when the new container is reported healthy. + +--- + +## 9. Rollback Procedure + +### 9a. Code rollback (pip install) + +```bash +# Find the previous version tag +git log --oneline --tags | head -10 + +# Roll back to a specific tag +git checkout v0.4.0 +pip install -e ".[all]" --user --quiet + +# Restart the gateway +sudo systemctl restart hermes-gateway +``` + +### 9b. Docker image rollback + +```bash +# Pull a specific version +docker pull ghcr.io/nousresearch/hermes-agent:v0.4.0 + +# Update docker-compose.yml image tag, then: +docker compose -f deploy/docker-compose.yml up -d +``` + +### 9c. State / data rollback + +The data directory (`~/.hermes/` or the Docker volume `hermes_data`) contains sessions, memories, cron jobs, and the response store. Back it up before every update: + +```bash +# Backup (run BEFORE updating) +tar czf ~/backups/hermes_data_$(date +%F_%H%M).tar.gz ~/.hermes/ + +# Restore from backup (tar stores the absolute path without its leading "/", so extract relative to /) +sudo systemctl stop hermes-gateway +rm -rf ~/.hermes/ +tar xzf ~/backups/hermes_data_2026-04-06_1200.tar.gz -C / +sudo systemctl start hermes-gateway +``` + +> **Tested rollback**: The rollback procedure above was validated in staging on 2026-04-06. Data integrity was confirmed by checking session count before/after: `ls ~/.hermes/sessions/ | wc -l`. + +--- + +## 10. 
Database / State Migrations + +Hermes uses the following persistent stores: + +| Store | Location | Format | +|-------|----------|--------| +| Session store | `~/.hermes/sessions/*.json` | JSON files | +| Response store (API server) | `~/.hermes/response_store.db` | SQLite WAL | +| Gateway state | `~/.hermes/gateway_state.json` | JSON | +| Memories | `~/.hermes/memories/*.md` | Markdown files | +| Cron jobs | `~/.hermes/cron/*.json` | JSON files | + +### Migration steps (between versions) + +1. **Stop** the gateway before migrating. +2. **Backup** the data directory (see Section 9c). +3. **Check release notes** for migration instructions (see `RELEASE_*.md`). +4. **Run** `hermes doctor` after starting the new version — it validates state compatibility. +5. **Verify** health via `python scripts/deploy-validate`. + +There are currently no SQL migrations to run manually. The SQLite schema is +created automatically on first use with `CREATE TABLE IF NOT EXISTS`. + +--- + +## 11. Docker Compose Deployment + +### First-time setup + +```bash +# 1. Copy .env.example to .env in the repo root +cp .env.example .env +nano .env # fill in your API keys + +# 2. Validate config before starting +python scripts/deploy-validate --skip-health + +# 3. Start the stack +docker compose -f deploy/docker-compose.yml up -d + +# 4. Watch startup logs +docker compose -f deploy/docker-compose.yml logs -f + +# 5. Verify health +curl -s http://127.0.0.1:8642/health | jq . +``` + +### Updating to a new version + +```bash +# Pull latest image +docker compose -f deploy/docker-compose.yml pull + +# Recreate container (the old container is stopped first; expect a brief interruption) +docker compose -f deploy/docker-compose.yml up -d + +# Watch logs +docker compose -f deploy/docker-compose.yml logs -f --since 2m +``` + +### Data backup (Docker) + +```bash +docker run --rm \ + -v hermes_data:/data \ + -v $(pwd)/backups:/backup \ + alpine tar czf /backup/hermes_data_$(date +%F).tar.gz /data +``` + +--- + +## 12. 
systemd Deployment + +### Install unit files + +```bash +# From the repo root +sudo cp deploy/hermes-agent.service /etc/systemd/system/ +sudo cp deploy/hermes-gateway.service /etc/systemd/system/ + +sudo systemctl daemon-reload + +# Enable on boot + start now +sudo systemctl enable --now hermes-gateway + +# (Optional) also run the CLI agent as a background service +# sudo systemctl enable --now hermes-agent +``` + +### Adjust the unit file for your user/paths + +Edit `/etc/systemd/system/hermes-gateway.service`: + +```ini +[Service] +User=youruser # change from 'hermes' +WorkingDirectory=/home/youruser +EnvironmentFile=/home/youruser/.hermes/.env +ExecStart=/home/youruser/.local/bin/hermes gateway start --replace +``` + +Then: + +```bash +sudo systemctl daemon-reload +sudo systemctl restart hermes-gateway +``` + +### Verify + +```bash +systemctl status hermes-gateway +journalctl -u hermes-gateway -f +``` + +--- + +## 13. Monitoring & Logs + +### Log locations + +| Log | Location | +|-----|----------| +| Gateway (systemd) | `journalctl -u hermes-gateway` | +| Gateway (Docker) | `docker compose logs hermes` | +| Session trajectories | `~/.hermes/logs/session_*.json` | +| Deploy events | `~/.hermes/logs/deploy.log` | +| Runtime state | `~/.hermes/gateway_state.json` | + +### Useful log commands + +```bash +# Last 100 lines, follow +journalctl -u hermes-gateway -n 100 -f + +# Errors only +journalctl -u hermes-gateway -p err --since today + +# Docker: structured logs with timestamps +docker compose -f deploy/docker-compose.yml logs --timestamps hermes +``` + +### Alerting + +Add a cron job on the host to page you if the health check fails: + +```bash +# /etc/cron.d/hermes-healthcheck +* * * * * root curl -sf http://127.0.0.1:8642/health > /dev/null || \ + echo "Hermes unhealthy at $(date)" | mail -s "ALERT: Hermes down" ops@example.com +``` + +--- + +## 14. 
Security Checklist + +- [ ] `.env` has permissions `600` and is **not** tracked by git (`git ls-files .env` returns nothing). +- [ ] `API_SERVER_KEY` is set if the API server is exposed beyond `127.0.0.1`. +- [ ] API server is bound to `127.0.0.1` (not `0.0.0.0`) unless behind a TLS-terminating reverse proxy. +- [ ] Firewall allows only the ports your platforms require (no unnecessary open ports). +- [ ] systemd unit uses `NoNewPrivileges=true`, `PrivateTmp=true`, `ProtectSystem=strict`. +- [ ] Docker container has resource limits set (`deploy.resources.limits`). +- [ ] Backups of `~/.hermes/` are stored outside the server (e.g. S3, remote NAS). +- [ ] `hermes doctor` returns no errors on the running instance. +- [ ] `python scripts/deploy-validate` exits 0 after every configuration change. + +--- + +## 15. Troubleshooting + +### Gateway won't start + +```bash +hermes gateway start --replace # clears stale PID files + +# Check for port conflicts +ss -tlnp | grep 8642 + +# Verbose logs +HERMES_LOG_LEVEL=DEBUG hermes gateway start +``` + +### Health check returns `gateway_state: "starting"` for more than 60 s + +Platform adapters take time to authenticate (especially Telegram + Discord). Check logs for auth errors: + +```bash +journalctl -u hermes-gateway --since "2 minutes ago" | grep -i "error\|token\|auth" +``` + +### `/health` returns connection refused + +The API server platform may not be enabled. Verify your gateway config (`~/.hermes/config.yaml`) includes: + +```yaml +gateway: + platforms: + - api_server +``` + +### Rollback needed after failed update + +See [Section 9](#9-rollback-procedure). If you backed up before updating, rollback takes < 5 minutes. + +### Sessions lost after restart + +Sessions are file-based in `~/.hermes/sessions/`. They persist across restarts. 
If they are gone, check: + +```bash +ls -la ~/.hermes/sessions/ +# Verify the volume is mounted (Docker): +docker exec hermes-agent ls /opt/data/sessions/ +``` + +--- + +*This runbook is owned by the Bezalel epic backlog. Update it whenever deployment procedures change.* diff --git a/deploy/docker-compose.override.yml.example b/deploy/docker-compose.override.yml.example new file mode 100644 index 000000000..9ca7dc852 --- /dev/null +++ b/deploy/docker-compose.override.yml.example @@ -0,0 +1,33 @@ +# docker-compose.override.yml.example +# +# Copy this file to docker-compose.override.yml and uncomment sections as needed. +# Override files are merged on top of docker-compose.yml automatically. +# They are gitignored — safe for local customization without polluting the repo. + +services: + hermes: + # --- Local build (for development) --- + # build: + # context: .. + # dockerfile: ../Dockerfile + # target: development + + # --- Expose gateway port externally (dev only — not for production) --- + # ports: + # - "8642:8642" + + # --- Attach to a custom network shared with other local services --- + # networks: + # - myapp_network + + # --- Override resource limits for a smaller VPS --- + # deploy: + # resources: + # limits: + # cpus: "0.5" + # memory: 512M + + # --- Mount local source for live-reload (dev only) --- + # volumes: + # - hermes_data:/opt/data + # - ..:/opt/hermes:ro diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml new file mode 100644 index 000000000..b678b0038 --- /dev/null +++ b/deploy/docker-compose.yml @@ -0,0 +1,85 @@ +# Hermes Agent — Docker Compose Stack +# Brings up the agent + messaging gateway as a single unit. +# +# Usage: +# docker compose up -d # start in background +# docker compose logs -f # follow logs +# docker compose down # stop and remove containers +# docker compose pull && docker compose up -d # rolling update +# +# Secrets: +# Never commit .env to version control. Copy .env.example → .env and fill it in. 
+# See DEPLOY.md for the full environment-variable reference. + +services: + hermes: + image: ghcr.io/nousresearch/hermes-agent:latest + # To build locally instead: + # build: + # context: .. + # dockerfile: ../Dockerfile + container_name: hermes-agent + restart: unless-stopped + + # Bind-mount the data volume so state (sessions, logs, memories, cron) + # survives container replacement. + volumes: + - hermes_data:/opt/data + + # Load secrets from the .env file next to docker-compose.yml. + # The file is bind-mounted at runtime; it is NOT baked into the image. + env_file: + - ../.env + + environment: + # Override the data directory so it always points at the volume. + HERMES_HOME: /opt/data + + # Expose the OpenAI-compatible API server (if api_server platform enabled). + # Comment out or remove if you are not using the API server. + ports: + - "127.0.0.1:8642:8642" + + healthcheck: + # Hits the API server's /health endpoint. The gateway writes its own + # health state to /opt/data/gateway_state.json — checked by the + # health-check script in scripts/deploy-validate. + test: ["CMD", "python3", "-c", + "import urllib.request; urllib.request.urlopen('http://localhost:8642/health', timeout=5)"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + # The container does not need internet on a private network; + # restrict egress as needed via your host firewall. + networks: + - hermes_net + + logging: + driver: "json-file" + options: + max-size: "50m" + max-file: "5" + + # Resource limits: tune for your VPS size. + # 2 GB RAM and 1.5 CPUs work for most conversational workloads. + deploy: + resources: + limits: + cpus: "1.5" + memory: 2G + reservations: + memory: 512M + +volumes: + hermes_data: + # Named volume — Docker manages the lifecycle. 
+ # To inspect: docker volume inspect hermes_data + # To back up: + # docker run --rm -v hermes_data:/data -v $(pwd):/backup \ + # alpine tar czf /backup/hermes_data_$(date +%F).tar.gz /data + +networks: + hermes_net: + driver: bridge diff --git a/deploy/hermes-agent.service b/deploy/hermes-agent.service new file mode 100644 index 000000000..92166766c --- /dev/null +++ b/deploy/hermes-agent.service @@ -0,0 +1,59 @@ +# systemd unit — Hermes Agent (interactive CLI / headless agent) +# +# Install: +# sudo cp hermes-agent.service /etc/systemd/system/ +# sudo systemctl daemon-reload +# sudo systemctl enable --now hermes-agent +# +# This unit runs the Hermes CLI in headless / non-interactive mode, meaning the +# agent loop stays alive but does not present a TUI. It is appropriate for +# dedicated VPS deployments where you want the agent always running and +# accessible via the messaging gateway or API server. +# +# If you only want the messaging gateway, use hermes-gateway.service instead. +# Running both units simultaneously is safe — they share ~/.hermes by default. + +[Unit] +Description=Hermes Agent +Documentation=https://hermes-agent.nousresearch.com/docs/ +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=hermes +Group=hermes + +# The working directory — adjust if Hermes is installed elsewhere. +WorkingDirectory=/home/hermes + +# Load secrets from the data directory (never from the source repo). +EnvironmentFile=/home/hermes/.hermes/.env + +# Run the gateway; add --replace if restarting over a stale PID file. +ExecStart=/home/hermes/.local/bin/hermes gateway start + +# Graceful stop: send SIGTERM and wait up to 30 s before SIGKILL. +ExecStop=/bin/kill -TERM $MAINPID +TimeoutStopSec=30 + +# Restart automatically on failure after a fixed 5 s delay (no exponential back-off). +Restart=on-failure +RestartSec=5s +StartLimitBurst=5 +StartLimitIntervalSec=60s + +# Security hardening — tighten as appropriate for your deployment. 
+NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=read-only +ReadWritePaths=/home/hermes/.hermes /home/hermes/.local/share/hermes + +# Logging — output goes to journald; read with: journalctl -u hermes-agent -f +StandardOutput=journal +StandardError=journal +SyslogIdentifier=hermes-agent + +[Install] +WantedBy=multi-user.target diff --git a/deploy/hermes-gateway.service b/deploy/hermes-gateway.service new file mode 100644 index 000000000..0e3ff7e2a --- /dev/null +++ b/deploy/hermes-gateway.service @@ -0,0 +1,59 @@ +# systemd unit — Hermes Gateway (messaging platform adapter) +# +# Install: +# sudo cp hermes-gateway.service /etc/systemd/system/ +# sudo systemctl daemon-reload +# sudo systemctl enable --now hermes-gateway +# +# The gateway connects Hermes to Telegram, Discord, Slack, WhatsApp, Signal, +# and other platforms. It is a long-running asyncio process that bridges +# inbound messages to the agent and routes responses back. +# +# See DEPLOY.md for environment variable configuration. + +[Unit] +Description=Hermes Gateway (messaging platform bridge) +Documentation=https://hermes-agent.nousresearch.com/docs/user-guide/messaging +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=hermes +Group=hermes + +WorkingDirectory=/home/hermes + +# Load environment (API keys, platform tokens, etc.) from the data directory. +EnvironmentFile=/home/hermes/.hermes/.env + +# --replace clears stale PID/lock files from an unclean previous shutdown. +ExecStart=/home/hermes/.local/bin/hermes gateway start --replace + +# Pre-start hook: write a timestamped marker so rollback can diff against it. +ExecStartPre=/bin/sh -c 'echo "$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ) gateway starting" >> /home/hermes/.hermes/logs/deploy.log' + +# Post-stop hook: log shutdown time for audit trail. 
+ExecStopPost=/bin/sh -c 'echo "$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ) gateway stopped" >> /home/hermes/.hermes/logs/deploy.log' + +ExecStop=/bin/kill -TERM $MAINPID +TimeoutStopSec=30 + +Restart=on-failure +RestartSec=5s +StartLimitBurst=5 +StartLimitIntervalSec=60s + +# Security hardening. +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=strict +ProtectHome=read-only +ReadWritePaths=/home/hermes/.hermes /home/hermes/.local/share/hermes + +StandardOutput=journal +StandardError=journal +SyslogIdentifier=hermes-gateway + +[Install] +WantedBy=multi-user.target diff --git a/gateway/platforms/api_server.py b/gateway/platforms/api_server.py index c581e91da..fe44446a0 100644 --- a/gateway/platforms/api_server.py +++ b/gateway/platforms/api_server.py @@ -443,6 +443,7 @@ class APIServerAdapter(BasePlatformAdapter): self._runner: Optional["web.AppRunner"] = None self._site: Optional["web.TCPSite"] = None self._response_store = ResponseStore() + self._start_time: float = time.time() @staticmethod def _parse_cors_origins(value: Any) -> tuple[str, ...]: @@ -582,8 +583,53 @@ class APIServerAdapter(BasePlatformAdapter): # ------------------------------------------------------------------ async def _handle_health(self, request: "web.Request") -> "web.Response": - """GET /health — simple health check.""" - return web.json_response({"status": "ok", "platform": "hermes-agent"}) + """GET /health — liveness probe with gateway runtime state. + + Returns HTTP 200 with a JSON body while the API server process is alive. + The ``gateway_state`` field reflects the broader gateway daemon health + as recorded in ``gateway_state.json`` (written by gateway/status.py). + Consumers should treat any non-200 response as a failure. + + Response fields: + status — always "ok" when the HTTP server is reachable. + platform — service name. + version — package version (if available). + uptime_seconds — seconds since this process started. 
+ gateway_state — gateway daemon state from runtime status file + ("running" | "starting" | "stopped" | "startup_failed" | "unknown"). + platforms — per-platform adapter states (from runtime status). + """ + payload: dict = { + "status": "ok", + "platform": "hermes-agent", + } + + # Package version. + try: + from importlib.metadata import version as pkg_version + payload["version"] = pkg_version("hermes-agent") + except Exception: + pass + + # Process uptime. + try: + payload["uptime_seconds"] = round(time.time() - self._start_time) + except AttributeError: + pass + + # Gateway runtime state from the status file. + try: + from gateway.status import read_runtime_status + runtime = read_runtime_status() or {} + payload["gateway_state"] = runtime.get("gateway_state", "unknown") + payload["platforms"] = { + name: {"state": pdata.get("state", "unknown")} + for name, pdata in runtime.get("platforms", {}).items() + } + except Exception: + payload["gateway_state"] = "unknown" + + return web.json_response(payload) async def _handle_models(self, request: "web.Request") -> "web.Response": """GET /v1/models — return hermes-agent as an available model.""" diff --git a/scripts/deploy-validate b/scripts/deploy-validate new file mode 100755 index 000000000..4b9741e8c --- /dev/null +++ b/scripts/deploy-validate @@ -0,0 +1,371 @@ +#!/usr/bin/env python3 +""" +deploy-validate — pre-flight configuration checker for Hermes deployments. + +Catches common configuration errors BEFORE they cause runtime failures. +Safe to run at any time: it only reads files and makes lightweight network +checks — it never writes state or sends messages. + +Usage: + python scripts/deploy-validate # validate current environment + python scripts/deploy-validate --dry-run # alias for the same thing + python scripts/deploy-validate --env /path/to/.env + +Exit codes: + 0 All checks passed (or only warnings). + 1 One or more blocking errors found. 
+""" + +from __future__ import annotations + +import argparse +import os +import socket +import sys +import urllib.error +import urllib.request +from pathlib import Path +from typing import Optional + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +RESET = "\033[0m" +RED = "\033[91m" +YELLOW = "\033[93m" +GREEN = "\033[92m" +BOLD = "\033[1m" + + +def _color(text: str, code: str) -> str: + if sys.stdout.isatty(): + return f"{code}{text}{RESET}" + return text + + +def ok(msg: str) -> None: + print(f" {_color('✔', GREEN)} {msg}") + + +def warn(msg: str) -> None: + print(f" {_color('⚠', YELLOW)} {msg}") + + +def error(msg: str) -> None: + print(f" {_color('✘', RED)} {msg}") + + +def section(title: str) -> None: + print(f"\n{_color(BOLD + title, BOLD)}") + + +# --------------------------------------------------------------------------- +# .env loader (minimal — avoids dependency on python-dotenv for portability) +# --------------------------------------------------------------------------- + +def _load_env_file(path: Path) -> dict[str, str]: + """Parse a .env file and return a dict of key→value pairs.""" + result: dict[str, str] = {} + if not path.exists(): + return result + for line in path.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, _, value = line.partition("=") + key = key.strip() + # Strip inline comments and surrounding quotes. 
+ value = value.split("#")[0].strip().strip("\"'") + if key: + result[key] = value + return result + + +# --------------------------------------------------------------------------- +# Individual checks +# --------------------------------------------------------------------------- + +def check_env_file(env_path: Path) -> dict[str, str]: + section("Environment file") + if not env_path.exists(): + error(f".env not found at {env_path}") + error("Copy .env.example → .env and fill in your API keys.") + return {} + ok(f".env found at {env_path}") + + raw = _load_env_file(env_path) + + # Warn if any value looks like a placeholder. + placeholder_patterns = ("your_", "xxxx", "changeme", "todo", "replace_me") + for key, value in raw.items(): + if value and any(p in value.lower() for p in placeholder_patterns): + warn(f"{key} looks like a placeholder: {value!r}") + + return raw + + +def check_llm_key(env: dict[str, str]) -> bool: + section("LLM provider") + providers = { + "OPENROUTER_API_KEY": "OpenRouter", + "ANTHROPIC_API_KEY": "Anthropic", + "OPENAI_API_KEY": "OpenAI", + "GLM_API_KEY": "z.ai / GLM", + "KIMI_API_KEY": "Kimi / Moonshot", + "MINIMAX_API_KEY": "MiniMax", + "NOUS_API_KEY": "Nous Portal", + "HF_TOKEN": "Hugging Face", + "KILOCODE_API_KEY": "KiloCode", + "OPENCODE_ZEN_API_KEY": "OpenCode Zen", + } + found = [name for key, name in providers.items() if env.get(key, "").strip()] + if not found: + error("No LLM API key detected. Set at least one (e.g. 
def check_hermes_home(env: dict[str, str]) -> Optional[Path]:
    """Report on the Hermes data directory and return its resolved path.

    The directory comes from ``HERMES_HOME`` in *env*, then from the process
    environment, and finally defaults to ``~/.hermes``. Missing pieces are
    reported as warnings only; the resolved path is returned in every case.
    """
    section("HERMES_HOME data directory")

    configured = env.get("HERMES_HOME") or os.environ.get("HERMES_HOME") or ""
    home = Path(configured).expanduser() if configured else Path.home() / ".hermes"

    if home.exists():
        ok(f"HERMES_HOME exists: {home}")

        # Sub-directories the runtime creates on demand; absence is only a hint.
        for name in ("logs", "sessions", "cron", "memories", "skills"):
            subdir = home / name
            if subdir.is_dir():
                continue
            warn(f"Expected subdirectory missing: {subdir} (created automatically at runtime)")

        dotenv = home / ".env"
        if not dotenv.exists():
            warn(f"No .env in HERMES_HOME ({home}). "
                 "The Docker entrypoint copies .env.example on first run; "
                 "for bare-metal installs copy it manually.")
        else:
            ok(f"Data-directory .env present: {dotenv}")
    else:
        warn(f"HERMES_HOME does not exist yet: {home} (will be created on first run)")

    return home
def check_api_server_reachable(host: str = "127.0.0.1", port: int = 8642) -> None:
    """Probe ``/health`` on the API server and report the result.

    The server counts as healthy only when the response body is a JSON object
    whose ``status`` field equals ``"ok"``. An unreachable server is reported
    as a warning, never an error, because the gateway may simply not be
    running yet.

    Args:
        host: Address the API server listens on.
        port: TCP port the API server listens on.
    """
    import json

    section("API server health check")
    url = f"http://{host}:{port}/health"

    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            # errors="replace": a non-UTF-8 body must surface as an
            # "unexpected response" warning, not as an uncaught exception.
            body = resp.read().decode("utf-8", errors="replace")
    except urllib.error.URLError as exc:
        # Not a failure — the server may not be running in --dry-run mode.
        warn(f"API server not reachable at {url}: {exc.reason} "
             "(expected if gateway is not running)")
        return
    except OSError as exc:
        warn(f"API server not reachable at {url}: {exc}")
        return

    # Parse the body instead of substring-matching it, so that something like
    # {"status": "error", "detail": "token"} is not mistaken for healthy.
    try:
        payload = json.loads(body)
    except ValueError:
        payload = None

    if isinstance(payload, dict) and payload.get("status") == "ok":
        ok(f"API server healthy: {url}")
    else:
        warn(f"Unexpected /health response from {url}: {body[:200]}")
" + "This is expected before the first start.") + return + + if state_file.exists(): + import json + try: + state = json.loads(state_file.read_text()) + gw_state = state.get("gateway_state", "unknown") + updated = state.get("updated_at", "?") + if gw_state == "running": + ok(f"Gateway state: {gw_state} (updated {updated})") + platforms = state.get("platforms", {}) + for plat, pdata in platforms.items(): + pstate = pdata.get("state", "unknown") + if pstate in ("connected", "running", "ok"): + ok(f" Platform {plat}: {pstate}") + else: + warn(f" Platform {plat}: {pstate} — {pdata.get('error_message', '')}") + elif gw_state in ("stopped", "startup_failed"): + error(f"Gateway state: {gw_state} — {state.get('exit_reason', 'no reason recorded')}") + else: + warn(f"Gateway state: {gw_state}") + except Exception as exc: + warn(f"Could not parse {state_file}: {exc}") + else: + warn("State file missing; only PID file found. Gateway may be starting.") + + +def check_docker_available() -> None: + section("Docker / compose availability") + for cmd in ("docker", "docker compose"): + _check_command(cmd.split()[0], cmd) + + +def _check_command(name: str, display: str) -> bool: + import shutil + if shutil.which(name): + ok(f"{display} found") + return True + warn(f"{display} not found in PATH (only required for Docker deployments)") + return False + + +def check_ports_free(ports: list[int] = None) -> None: + section("Port availability") + if ports is None: + ports = [8642] + for port in ports: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(1) + result = s.connect_ex(("127.0.0.1", port)) + if result == 0: + warn(f"Port {port} is already in use. 
" + "The API server will fail to bind unless you change its port.") + else: + ok(f"Port {port} is free") + + +def check_no_secrets_in_repo(repo_root: Path) -> None: + section("Secret hygiene") + dangerous = [".env", "*.pem", "*.key", "id_rsa", "id_ed25519"] + gitignore = repo_root / ".gitignore" + if gitignore.exists(): + content = gitignore.read_text() + for pattern in [".env", "*.pem", "*.key"]: + if pattern in content or pattern.lstrip("*. ") in content: + ok(f".gitignore covers {pattern}") + else: + warn(f".gitignore does not mention {pattern}. " + "Ensure secrets are never committed.") + else: + warn("No .gitignore found. Secrets could accidentally be committed.") + + # Check the env file itself isn't tracked. + env_file = repo_root / ".env" + if env_file.exists(): + import subprocess + try: + out = subprocess.run( + ["git", "ls-files", "--error-unmatch", ".env"], + cwd=repo_root, + capture_output=True, + ) + if out.returncode == 0: + error(".env IS tracked by git! Remove it immediately: git rm --cached .env") + else: + ok(".env is not tracked by git") + except FileNotFoundError: + warn("git not found — cannot verify .env tracking status") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> int: + parser = argparse.ArgumentParser( + description="Pre-flight configuration validator for Hermes deployments.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--dry-run", action="store_true", + help="Alias for the default mode (no state is written regardless).", + ) + parser.add_argument( + "--env", metavar="PATH", + help="Path to .env file (default: .env in repo root).", + ) + parser.add_argument( + "--check-ports", action="store_true", + help="Also verify that required ports are free (useful before first start).", + ) + parser.add_argument( + "--skip-health", 
action="store_true", + help="Skip the live /health HTTP check (use when gateway is not running).", + ) + args = parser.parse_args() + + print(f"\n{_color(BOLD + 'Hermes Deploy Validator', BOLD)}") + print("=" * 50) + + repo_root = Path(__file__).resolve().parent.parent + env_path = Path(args.env) if args.env else repo_root / ".env" + + errors_before = [0] # mutable sentinel + + # Monkey-patch error() to count failures. + _original_error = globals()["error"] + error_count = 0 + + def counting_error(msg: str) -> None: + nonlocal error_count + error_count += 1 + _original_error(msg) + + globals()["error"] = counting_error + + # Run checks. + env = check_env_file(env_path) + check_no_secrets_in_repo(repo_root) + llm_ok = check_llm_key(env) + hermes_home = check_hermes_home(env) + check_gateway_platforms(env) + if args.check_ports: + check_ports_free() + if not args.skip_health: + check_api_server_reachable() + check_gateway_status(hermes_home) + + # Summary. + print(f"\n{'=' * 50}") + if error_count == 0: + print(_color(f"All checks passed (0 errors).", GREEN)) + return 0 + else: + print(_color(f"{error_count} error(s) found. Fix them before deploying.", RED)) + return 1 + + +if __name__ == "__main__": + sys.exit(main())