#!/usr/bin/env python3
"""
Auto-Restart Agent — Self-healing process monitor for fleet machines.

Detects dead services and restarts them automatically.
Escalates after 3 attempts (prevents restart loops).
Logs all actions to ~/.local/timmy/fleet-health/restarts.log
Alerts via Telegram if service cannot be recovered.

Prerequisite: FLEET-006 (health check) must be running to detect failures.

Usage:
    python3 auto_restart.py              # Run checks now
    python3 auto_restart.py --daemon     # Run continuously (every 60s)
    python3 auto_restart.py --status     # Show restart history
"""

import json
import os
import shlex
import subprocess
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

# === CONFIG ===
LOG_DIR = Path(os.path.expanduser("~/.local/timmy/fleet-health"))
RESTART_LOG = LOG_DIR / "restarts.log"
COOLDOWN_FILE = LOG_DIR / "restart_cooldowns.json"
MAX_RETRIES = 3
COOLDOWN_PERIOD = 3600  # seconds to stop retrying after MAX_RETRIES failures
CHECK_INTERVAL = 60  # seconds between daemon cycles
VERIFY_DELAY = 3  # seconds to wait after a restart before re-checking

# Services definition: name, check command, restart command
# Local services:
LOCAL_SERVICES = {
    "hermes-gateway": {
        "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
        "restart": "cd ~/code-claw && ./restart-gateway.sh 2>/dev/null || launchctl kickstart -k ai.hermes.gateway 2>/dev/null",
        "critical": True,
    },
    "ollama": {
        "check": "pgrep -f 'ollama serve' > /dev/null 2>/dev/null",
        "restart": "launchctl kickstart -k com.ollama.ollama 2>/dev/null || /opt/homebrew/bin/brew services restart ollama 2>/dev/null",
        "critical": False,
    },
    "codeclaw-heartbeat": {
        "check": "launchctl list | grep 'ai.timmy.codeclaw-qwen-heartbeat' > /dev/null 2>/dev/null",
        "restart": "launchctl kickstart -k ai.timmy.codeclaw-qwen-heartbeat 2>/dev/null",
        "critical": False,
    },
}

# VPS services to restart via SSH
VPS_SERVICES = {
    "ezra": {
        "ip": "143.198.27.163",
        "user": "root",
        "services": {
            "gitea": {
                "check": "systemctl is-active gitea 2>/dev/null | grep -q active",
                "restart": "systemctl restart gitea 2>/dev/null",
                "critical": True,
            },
            "nginx": {
                "check": "systemctl is-active nginx 2>/dev/null | grep -q active",
                "restart": "systemctl restart nginx 2>/dev/null",
                "critical": False,
            },
            "hermes-agent": {
                "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
                "restart": "cd /root/wizards/ezra/hermes-agent && source .venv/bin/activate && nohup hermes gateway run --replace > /dev/null 2>&1 &",
                "critical": True,
            },
        },
    },
    "allegro": {
        "ip": "167.99.126.228",
        "user": "root",
        "services": {
            "hermes-agent": {
                "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
                "restart": "cd /root/wizards/allegro/hermes-agent && source .venv/bin/activate && nohup hermes gateway run --replace > /dev/null 2>&1 &",
                "critical": True,
            },
        },
    },
    "bezalel": {
        "ip": "159.203.146.185",
        "user": "root",
        "services": {
            "hermes-agent": {
                "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
                # The previous command ran `cd` on the venv's `activate`
                # *file*, which always fails, so the gateway was never
                # started. Mirror the other hosts: enter the project
                # directory, then source the venv.
                # NOTE(review): project dir inferred from the old path —
                # confirm on the host.
                "restart": "cd /root/wizards/bezalel/hermes && source venv/bin/activate && nohup hermes gateway run > /dev/null 2>&1 &",
                "critical": True,
            },
            "evennia": {
                "check": "pgrep -f 'evennia' > /dev/null 2>/dev/null",
                "restart": "cd /root/.evennia/timmy_world && evennia restart 2>/dev/null",
                "critical": False,
            },
        },
    },
}

TELEGRAM_TOKEN_FILE = Path(os.path.expanduser("~/.config/telegram/special_bot"))
TELEGRAM_CHAT = "-1003664764329"


def send_telegram(message):
    """Send *message* to the configured Telegram chat.

    Best-effort: returns True when the HTTP request completed, False when
    the token file is missing or anything goes wrong while sending.
    Never raises, so a broken alert channel cannot stop the monitor.
    """
    if not TELEGRAM_TOKEN_FILE.exists():
        return False
    try:
        token = TELEGRAM_TOKEN_FILE.read_text().strip()
        url = f"https://api.telegram.org/bot{token}/sendMessage"
        body = json.dumps({
            "chat_id": TELEGRAM_CHAT,
            "text": f"[AUTO-RESTART]\n{message}",
        }).encode()
        req = urllib.request.Request(
            url,
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        # The context manager closes the HTTP response (previously leaked).
        with urllib.request.urlopen(req, timeout=10):
            pass
        return True
    except Exception:
        # Deliberately broad: alerting is best-effort by design.
        return False


def get_cooldowns():
    """Load the per-service failure state from COOLDOWN_FILE.

    Returns a dict keyed by "<machine>/<service>". An absent, unreadable,
    corrupt, or non-dict file yields an empty dict.
    """
    if COOLDOWN_FILE.exists():
        try:
            data = json.loads(COOLDOWN_FILE.read_text())
        except (json.JSONDecodeError, OSError):
            return {}
        if isinstance(data, dict):
            return data
    return {}


def save_cooldowns(data):
    """Persist the per-service failure state to COOLDOWN_FILE as JSON."""
    # Create the directory here too, so callers importing this module do
    # not depend on the __main__ block having run first.
    COOLDOWN_FILE.parent.mkdir(parents=True, exist_ok=True)
    COOLDOWN_FILE.write_text(json.dumps(data, indent=2))


def _run_ok(cmd, timeout):
    """Run shell command *cmd*; True only if it exits 0 within *timeout*."""
    try:
        proc = subprocess.run(cmd, shell=True, capture_output=True, timeout=timeout)
    except (subprocess.SubprocessError, OSError):
        # TimeoutExpired is a SubprocessError; a failed spawn is an OSError.
        return False
    return proc.returncode == 0


def check_service(check_cmd, timeout=10):
    """Return True if the health-check shell command exits 0 in time."""
    return _run_ok(check_cmd, timeout)


def restart_service(restart_cmd, timeout=30):
    """Return True if the restart shell command exits 0 in time.

    An exit status of 0 only means the command itself succeeded; commands
    that background their work with a trailing ``&`` return 0 immediately.
    """
    return _run_ok(restart_cmd, timeout)


def _ssh_command(host_config, remote_cmd, connect_timeout):
    """Build the local shell command that runs *remote_cmd* over SSH.

    The remote command is passed through shlex.quote so the local shell
    hands it to ssh as a single, uninterpreted argument.
    """
    target = f'{host_config["user"]}@{host_config["ip"]}'
    return (
        f"ssh -o StrictHostKeyChecking=no -o ConnectTimeout={connect_timeout} "
        f"{shlex.quote(target)} {shlex.quote(remote_cmd)}"
    )


def try_restart_via_ssh(name, host_config, service_name):
    """Run the restart command of *service_name* on a remote host.

    Args:
        name: Host label; unused, kept for interface compatibility.
        host_config: Entry of VPS_SERVICES ("ip", "user", "services").
        service_name: Key into host_config["services"].

    Returns:
        True if the ssh invocation exited 0 within 30 seconds.
    """
    service = host_config["services"][service_name]
    restart_cmd = _ssh_command(host_config, service["restart"], connect_timeout=10)
    return restart_service(restart_cmd, timeout=30)


def log_restart(service_name, machine, attempt, success):
    """Append one restart attempt to RESTART_LOG and echo it to stdout."""
    ts = datetime.now(timezone.utc).isoformat()
    status = "SUCCESS" if success else "FAILED"
    log_entry = f"{ts} [{status}] {machine}/{service_name} (attempt {attempt})\n"
    RESTART_LOG.parent.mkdir(parents=True, exist_ok=True)
    with open(RESTART_LOG, "a") as f:
        f.write(log_entry)
    print(f"  [{status}] {machine}/{service_name} - attempt {attempt}")


def restart_decision(state, now):
    """Decide what to do with a service that failed its health check.

    Args:
        state: Stored entry for the service ("count", "last"); may be empty.
        now: Current time as a Unix timestamp.

    Returns:
        ("restart", failures) when a restart should be attempted, where
        *failures* is the number of consecutive failures so far (reset to 0
        once the cooldown has elapsed), or ("hold", failures) while the
        service has exhausted MAX_RETRIES and is still inside
        COOLDOWN_PERIOD.
    """
    failures = state.get("count", 0)
    if failures < MAX_RETRIES:
        return "restart", failures
    if now - state.get("last", 0) < COOLDOWN_PERIOD:
        return "hold", failures
    return "restart", 0


def _supervise(machine, name, critical, check_cmd, restart_cmd, cooldowns, now):
    """Check one service and try to recover it; mutates and saves *cooldowns*.

    Returns None when the service is healthy, otherwise a dict describing
    the action taken ("service", "action", "attempt", "recovered").
    """
    key = f"{machine}/{name}"
    state = cooldowns.get(key)
    if not isinstance(state, dict):
        state = {}

    if check_service(check_cmd):
        # A healthy service clears earlier failures, so old incidents do
        # not count toward a later escalation.
        if state.get("count", 0):
            cooldowns[key] = {"count": 0, "last": state.get("last", 0)}
            save_cooldowns(cooldowns)
        return None

    alert = f"CRITICAL: {key} failed {MAX_RETRIES} restart attempts. Needs human intervention."
    action, failures = restart_decision(state, now)

    if action == "hold":
        # Retries exhausted: stop restarting until the cooldown elapses.
        # Only resend the alert if the earlier send did not go through.
        if critical and not state.get("alerted"):
            cooldowns[key] = dict(state, alerted=send_telegram(alert))
            save_cooldowns(cooldowns)
        return {"service": key, "action": "hold", "attempt": failures, "recovered": False}

    attempt = failures + 1
    recovered = restart_service(restart_cmd)
    if recovered:
        # Exit status 0 is not proof of life (see restart_service), so
        # confirm with the health check before counting it as a success.
        time.sleep(VERIFY_DELAY)
        recovered = check_service(check_cmd)
        if recovered:
            print(f"  VERIFIED: {key} is running")
        else:
            print(f"  WARNING: {key} restart command returned success but process not detected")
    log_restart(name, machine, attempt, recovered)

    if recovered:
        cooldowns[key] = {"count": 0, "last": now}
    else:
        entry = {"count": attempt, "last": now, "alerted": False}
        if attempt >= MAX_RETRIES and critical:
            entry["alerted"] = send_telegram(alert)
        cooldowns[key] = entry
    save_cooldowns(cooldowns)
    return {"service": key, "action": "restart", "attempt": attempt, "recovered": recovered}


def check_and_restart():
    """Run all restart checks.

    Checks every local and VPS service once. A service that is down is
    restarted and re-checked; after MAX_RETRIES consecutive failures no
    further restarts are attempted for COOLDOWN_PERIOD seconds, and
    critical services trigger one Telegram alert.

    Returns:
        A list with one dict per service found down in this cycle
        (empty when everything is healthy).
    """
    results = []
    cooldowns = get_cooldowns()
    now = time.time()

    # Check local services
    for name, service in LOCAL_SERVICES.items():
        outcome = _supervise(
            "local", name, service["critical"],
            service["check"], service["restart"],
            cooldowns, now,
        )
        if outcome is not None:
            results.append(outcome)

    # Check VPS services
    for host, host_config in VPS_SERVICES.items():
        for service_name, service in host_config["services"].items():
            outcome = _supervise(
                host, service_name, service["critical"],
                _ssh_command(host_config, service["check"], connect_timeout=5),
                _ssh_command(host_config, service["restart"], connect_timeout=10),
                cooldowns, now,
            )
            if outcome is not None:
                results.append(outcome)

    return results


def daemon_mode():
    """Run continuously every 60 seconds."""
    remote_count = sum(len(h["services"]) for h in VPS_SERVICES.values())
    print(f"Auto-restart agent running in daemon mode ({CHECK_INTERVAL}s interval)")
    print(f"Monitoring {len(LOCAL_SERVICES)} local + {remote_count} remote services")
    print(f"Max retries per cycle: {MAX_RETRIES}")
    print(f"Cooldown after max retries: {COOLDOWN_PERIOD}s")
    while True:
        try:
            check_and_restart()
        except Exception as exc:
            # One bad cycle (e.g. an unwritable log) must not kill the
            # monitor; report it and try again on the next cycle.
            print(f"  ERROR: check cycle failed: {exc!r}", file=sys.stderr)
        time.sleep(CHECK_INTERVAL)


def show_status():
    """Show restart history and cooldowns."""
    cooldowns = get_cooldowns()
    print("=== Restart Cooldowns ===")
    for key, data in sorted(cooldowns.items()):
        count = data.get("count", 0)
        if count > 0:
            print(f"  {key}: {count} failures, last at {datetime.fromtimestamp(data.get('last',0), tz=timezone.utc).strftime('%H:%M')}")

    print("\n=== Restart Log (last 20) ===")
    if RESTART_LOG.exists():
        for line in RESTART_LOG.read_text().splitlines()[-20:]:
            print(f"  {line}")
    else:
        print("  No restarts logged yet.")


if __name__ == "__main__":
    LOG_DIR.mkdir(parents=True, exist_ok=True)

    if len(sys.argv) > 1 and sys.argv[1] == "--daemon":
        daemon_mode()
    elif len(sys.argv) > 1 and sys.argv[1] == "--status":
        show_status()
    else:
        check_and_restart()