# timmy-config/fleet/auto_restart.py

#!/usr/bin/env python3
"""
Auto-Restart Agent — Self-healing process monitor for fleet machines.

Detects dead services and restarts them automatically.
Escalates after 3 attempts (prevents restart loops).
Logs all actions to ~/.local/timmy/fleet-health/restarts.log
Alerts via Telegram if service cannot be recovered.

Prerequisite: FLEET-006 (health check) must be running to detect failures.

Usage:
  python3 auto_restart.py          # Run checks now
  python3 auto_restart.py --daemon # Run continuously (every 60s)
  python3 auto_restart.py --status  # Show restart history
"""

import os
import sys
import json
import time
import subprocess
from datetime import datetime, timezone
from pathlib import Path

# === CONFIG ===
# Every state file the agent writes lives under one per-user directory.
LOG_DIR = Path(os.path.expanduser("~")) / ".local" / "timmy" / "fleet-health"
RESTART_LOG = LOG_DIR.joinpath("restarts.log")  # append-only restart history
COOLDOWN_FILE = LOG_DIR.joinpath("restart_cooldowns.json")  # per-service retry state
MAX_RETRIES = 3
COOLDOWN_PERIOD = 60 * 60  # 1 hour between escalation alerts

# Service definitions for this machine, keyed by service name. Each entry has:
#   check    - shell command; exit status 0 means the service is running
#   restart  - shell command used to bring the service back
#   critical - whether an unrecoverable failure is escalated via Telegram
LOCAL_SERVICES = {
    "hermes-gateway": dict(
        check="pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
        restart=(
            "cd ~/code-claw && ./restart-gateway.sh 2>/dev/null"
            " || launchctl kickstart -k ai.hermes.gateway 2>/dev/null"
        ),
        critical=True,
    ),
    "ollama": dict(
        check="pgrep -f 'ollama serve' > /dev/null 2>/dev/null",
        restart=(
            "launchctl kickstart -k com.ollama.ollama 2>/dev/null"
            " || /opt/homebrew/bin/brew services restart ollama 2>/dev/null"
        ),
        critical=False,
    ),
    "codeclaw-heartbeat": dict(
        check="launchctl list | grep 'ai.timmy.codeclaw-qwen-heartbeat' > /dev/null 2>/dev/null",
        restart="launchctl kickstart -k ai.timmy.codeclaw-qwen-heartbeat 2>/dev/null",
        critical=False,
    ),
}

# VPS services to restart via SSH.
# Keyed by host nickname; each host has the SSH target ("user"@"ip") and a
# "services" mapping with the same check/restart/critical shape as the local
# services. The check and restart commands are executed on the remote host.
VPS_SERVICES = {
    "ezra": {
        "ip": "143.198.27.163",
        "user": "root",
        "services": {
            "gitea": {
                "check": "systemctl is-active gitea 2>/dev/null | grep -q active",
                "restart": "systemctl restart gitea 2>/dev/null",
                "critical": True,
            },
            "nginx": {
                "check": "systemctl is-active nginx 2>/dev/null | grep -q active",
                "restart": "systemctl restart nginx 2>/dev/null",
                "critical": False,
            },
            "hermes-agent": {
                "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
                "restart": "cd /root/wizards/ezra/hermes-agent && source .venv/bin/activate && nohup hermes gateway run --replace > /dev/null 2>&1 &",
                "critical": True,
            },
        },
    },
    "allegro": {
        "ip": "167.99.126.228",
        "user": "root",
        "services": {
            "hermes-agent": {
                "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
                "restart": "cd /root/wizards/allegro/hermes-agent && source .venv/bin/activate && nohup hermes gateway run --replace > /dev/null 2>&1 &",
                "critical": True,
            },
        },
    },
    "bezalel": {
        "ip": "159.203.146.185",
        "user": "root",
        "services": {
            "hermes-agent": {
                "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
                # The virtualenv's activate script must be sourced, not cd'ed
                # into: `cd <file>` always fails, so the gateway never started
                # while the backgrounded command still reported exit status 0.
                # NOTE(review): assumes the project root is
                # /root/wizards/bezalel/hermes - confirm on the host.
                "restart": "cd /root/wizards/bezalel/hermes && source venv/bin/activate && nohup hermes gateway run > /dev/null 2>&1 &",
                "critical": True,
            },
            "evennia": {
                "check": "pgrep -f 'evennia' > /dev/null 2>/dev/null",
                "restart": "cd /root/.evennia/timmy_world && evennia restart 2>/dev/null",
                "critical": False,
            },
        },
    },
}

# Telegram alerting: the bot token is read from a per-user file and every
# alert is posted to one fixed chat id.
TELEGRAM_TOKEN_FILE = Path(os.path.expanduser("~")) / ".config" / "telegram" / "special_bot"
TELEGRAM_CHAT = "-1003664764329"


def send_telegram(message):
    """Post an alert to the fleet Telegram chat (best effort).

    The text is prefixed with ``[AUTO-RESTART]`` and sent through the Bot API
    ``sendMessage`` endpoint using the token stored in TELEGRAM_TOKEN_FILE.

    Args:
        message: Alert text to send.

    Returns:
        True if the request completed without raising; False if the token
        file is missing, unreadable or empty, or if the request failed.
    """
    import urllib.request

    # EAFP: a missing or unreadable token file just means alerting is
    # unavailable; it must not crash the monitor.
    try:
        token = TELEGRAM_TOKEN_FILE.read_text().strip()
    except (OSError, ValueError):
        return False
    if not token:
        return False

    url = f"https://api.telegram.org/bot{token}/sendMessage"
    body = json.dumps({
        "chat_id": TELEGRAM_CHAT,
        "text": f"[AUTO-RESTART]\n{message}",
    }).encode()
    req = urllib.request.Request(
        url,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        # The context manager closes the HTTP response (and its socket).
        with urllib.request.urlopen(req, timeout=10):
            return True
    except Exception:
        # Deliberately broad: alerting is best effort and any transport or
        # HTTP error is reported to the caller as "not sent".
        return False


def get_cooldowns():
    """Load the persisted per-service retry state.

    Returns:
        A dict mapping "<machine>/<service>" keys to state dicts. An empty
        dict is returned when COOLDOWN_FILE is missing, unreadable, or does
        not contain a JSON object; entries whose value is not a dict are
        dropped so callers can safely call ``.get`` on every value.
    """
    try:
        data = json.loads(COOLDOWN_FILE.read_text())
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError covers both
        # json.JSONDecodeError and undecodable bytes.
        return {}
    if not isinstance(data, dict):
        return {}
    return {key: state for key, state in data.items() if isinstance(state, dict)}


def save_cooldowns(data):
    """Persist the per-service retry state to COOLDOWN_FILE.

    The JSON is written to a sibling temporary file and then renamed over the
    target, so a crash mid-write cannot leave a truncated file (which would
    be read back as empty state and silently reset every retry counter).

    Args:
        data: JSON-serialisable dict of retry state.
    """
    COOLDOWN_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = COOLDOWN_FILE.with_name(COOLDOWN_FILE.name + ".tmp")
    tmp_path.write_text(json.dumps(data, indent=2))
    # Rename within the same directory replaces the old file in one step.
    tmp_path.replace(COOLDOWN_FILE)


def check_service(check_cmd, timeout=10):
    """Run a shell health-check command and report whether it passed.

    Args:
        check_cmd: Shell command line; exit status 0 means "healthy".
        timeout: Seconds to wait before giving up on the command.

    Returns:
        True when the command exits with status 0; False on a non-zero
        status, a timeout, or any other subprocess error.
    """
    try:
        completed = subprocess.run(
            check_cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.SubprocessError:
        # TimeoutExpired is a SubprocessError, so it is covered here too.
        return False
    else:
        return not completed.returncode


def restart_service(restart_cmd, timeout=30):
    """Run a shell restart command and report whether it exited cleanly.

    Output is discarded rather than captured. With captured pipes, a restart
    command that leaves a background process behind would keep the pipes
    open, making this call block until the timeout and report a failure even
    though the restart itself worked.

    Args:
        restart_cmd: Shell command line that restarts the service.
        timeout: Seconds to wait for the command to finish.

    Returns:
        True when the command exits with status 0; False on a non-zero
        status, a timeout, or any other subprocess error.
    """
    try:
        proc = subprocess.run(
            restart_cmd,
            shell=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=timeout,
        )
    except subprocess.SubprocessError:
        # TimeoutExpired is a SubprocessError, so it is covered here too.
        return False
    return proc.returncode == 0


def try_restart_via_ssh(name, host_config, service_name):
    """Restart one service on a remote host by running its command over SSH.

    Args:
        name: Host nickname (not used when building the command).
        host_config: Host entry providing "ip", "user" and "services".
        service_name: Key of the service inside ``host_config["services"]``.

    Returns:
        The boolean result of restart_service for the ssh command.
    """
    ip, user = host_config["ip"], host_config["user"]
    remote_cmd = host_config["services"][service_name]["restart"]

    # The remote command travels as a single double-quoted argument.
    ssh_cmd = "".join([
        "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 ",
        "{}@{}".format(user, ip),
        ' "{}"'.format(remote_cmd),
    ])
    return restart_service(ssh_cmd, timeout=30)


def log_restart(service_name, machine, attempt, success):
    """Record one restart attempt in RESTART_LOG and echo it to stdout.

    Args:
        service_name: Name of the service that was restarted.
        machine: "local" or the remote host nickname.
        attempt: 1-based attempt number for this service.
        success: Whether the restart was considered successful.
    """
    stamp = datetime.now(timezone.utc).isoformat()
    if success:
        status = "SUCCESS"
    else:
        status = "FAILED"

    # Create the log directory on demand so logging never depends on setup.
    RESTART_LOG.parent.mkdir(parents=True, exist_ok=True)
    with open(RESTART_LOG, "a") as log_file:
        log_file.write(f"{stamp} [{status}] {machine}/{service_name} (attempt {attempt})\n")

    print(f"  [{status}] {machine}/{service_name} - attempt {attempt}")


# Seconds to wait after a restart command before re-running the health check.
_VERIFY_DELAY = 3


def _recover_service(machine, name, service, check_cmd, do_restart, cooldowns, now):
    """Check one service and, if it is down, try to bring it back.

    Retry policy: after MAX_RETRIES consecutive failed attempts the service
    is left alone for COOLDOWN_PERIOD seconds. A critical service triggers
    one Telegram alert per cooldown window (re-sent only if delivery failed).
    Once the window has passed, the counter starts over.

    Args:
        machine: "local" or the VPS host nickname.
        name: Service name on that machine.
        service: Service definition with "check", "restart" and "critical".
        check_cmd: Shell command whose exit status 0 means "running".
        do_restart: Zero-argument callable running the restart; returns bool.
        cooldowns: Retry state keyed "<machine>/<name>"; updated in place and
            saved whenever it changes.
        now: ``time.time()`` value for the current monitoring cycle.

    Returns:
        None when no restart was attempted, otherwise a dict with the keys
        "service", "attempt" and "success".
    """
    key = f"{machine}/{name}"
    state = cooldowns.get(key, {})
    if not isinstance(state, dict):
        state = {}
    failures = state.get("count", 0)

    if check_service(check_cmd):
        # Healthy again: forget earlier failures so that only consecutive
        # failures count towards escalation.
        if failures:
            cooldowns[key] = {"count": 0, "last": now}
            save_cooldowns(cooldowns)
        return None

    alerted = bool(state.get("alerted", False))
    alert_text = f"CRITICAL: {key} failed {MAX_RETRIES} restart attempts. Needs human intervention."

    if failures >= MAX_RETRIES:
        if now - state.get("last", 0) < COOLDOWN_PERIOD:
            # Cooling down: no restart. Only make sure the escalation for a
            # critical service has actually been delivered.
            if service["critical"] and not alerted and send_telegram(alert_text):
                cooldowns[key] = {**state, "alerted": True}
                save_cooldowns(cooldowns)
            return None
        # Cooldown elapsed: begin a fresh round of attempts.
        failures, alerted = 0, False

    attempt = failures + 1
    success = do_restart()
    if success:
        # A zero exit status is not proof of recovery (backgrounded commands
        # always exit 0), so the health check decides.
        time.sleep(_VERIFY_DELAY)
        success = check_service(check_cmd)
        if success:
            print(f"  VERIFIED: {key} is running")
        else:
            print(f"  WARNING: {key} restart command returned success but process not detected")
    log_restart(name, machine, attempt, success)

    if success:
        cooldowns[key] = {"count": 0, "last": now}
    else:
        if attempt >= MAX_RETRIES and service["critical"]:
            alerted = send_telegram(alert_text)
        cooldowns[key] = {"count": attempt, "last": now, "alerted": alerted}
    save_cooldowns(cooldowns)

    return {"service": key, "attempt": attempt, "success": success}


def check_and_restart():
    """Run one monitoring cycle over all local and VPS services.

    Each service is health-checked; a service that is down is restarted
    subject to the retry/cooldown policy, and the attempt is logged.

    Returns:
        A list with one dict per restart attempt made in this cycle (keys
        "service", "attempt", "success"); empty when nothing was restarted.
    """
    results = []
    cooldowns = get_cooldowns()
    now = time.time()

    # Local services: commands run directly on this machine.
    for name, service in LOCAL_SERVICES.items():
        outcome = _recover_service(
            "local",
            name,
            service,
            service["check"],
            lambda cmd=service["restart"]: restart_service(cmd),
            cooldowns,
            now,
        )
        if outcome is not None:
            results.append(outcome)

    # VPS services: both the check and the restart go through ssh.
    for host, host_config in VPS_SERVICES.items():
        for service_name, service in host_config["services"].items():
            check_cmd = f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 {host_config["user"]}@{host_config["ip"]} "{service["check"]}"'
            outcome = _recover_service(
                host,
                service_name,
                service,
                check_cmd,
                lambda h=host, cfg=host_config, svc=service_name: try_restart_via_ssh(h, cfg, svc),
                cooldowns,
                now,
            )
            if outcome is not None:
                results.append(outcome)

    return results


def daemon_mode():
    """Print a startup banner, then run a check cycle every 60 seconds forever."""
    print("Auto-restart agent running in daemon mode (60s interval)")

    local_total = len(LOCAL_SERVICES)
    remote_total = 0
    for host_config in VPS_SERVICES.values():
        remote_total += len(host_config["services"])

    print(f"Monitoring {local_total} local + {remote_total} remote services")
    print(f"Max retries per cycle: {MAX_RETRIES}")
    print(f"Cooldown after max retries: {COOLDOWN_PERIOD}s")

    # No exit condition: the loop runs until the process is stopped.
    while True:
        check_and_restart()
        time.sleep(60)


def show_status():
    """Print the current failure counters and the last 20 restart log lines.

    Only services with a non-zero failure count are listed. When the restart
    log is missing or contains no entries, a placeholder message is printed.
    """
    cooldowns = get_cooldowns()
    print("=== Restart Cooldowns ===")
    for key, data in sorted(cooldowns.items()):
        count = data.get("count", 0)
        if count > 0:
            last_at = datetime.fromtimestamp(data.get("last", 0), tz=timezone.utc)
            print(f"  {key}: {count} failures, last at {last_at.strftime('%H:%M')}")

    print("\n=== Restart Log (last 20) ===")
    entries = []
    if RESTART_LOG.exists():
        # Skip blank lines so an empty log is not shown as one empty entry.
        entries = [line for line in RESTART_LOG.read_text().splitlines() if line.strip()]

    if entries:
        for line in entries[-20:]:
            print(f"  {line}")
    else:
        print("  No restarts logged yet.")


if __name__ == "__main__":
    # Make sure the state directory exists before any mode touches it.
    LOG_DIR.mkdir(parents=True, exist_ok=True)

    # Only the first argument selects the mode; anything else (or nothing)
    # runs a single check cycle.
    mode = sys.argv[1] if len(sys.argv) > 1 else None
    handlers = {
        "--daemon": daemon_mode,
        "--status": show_status,
    }
    handlers.get(mode, check_and_restart)()