Daemon that monitors key services and restarts them automatically: - Local: hermes-gateway, ollama, codeclaw-heartbeat - Ezra: gitea, nginx, hermes-agent - Allegro: hermes-agent - Bezalel: hermes-agent, evennia - Max 3 restart attempts per service per cycle (prevents loops) - 1-hour cooldown after max retries with Telegram escalation - Restart log at ~/.local/timmy/fleet-health/restarts.log - Modes: check now (--status for history, --daemon for continuous) Fixes timmy-home#560
273 lines
10 KiB
Python
Executable File
273 lines
10 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Auto-Restart Agent — Self-healing process monitor for fleet machines.

Detects dead services and restarts them automatically.
Escalates after 3 attempts (prevents restart loops).
Logs all actions to ~/.local/timmy/fleet-health/restarts.log
Alerts via Telegram if service cannot be recovered.

Prerequisite: FLEET-006 (health check) must be running to detect failures.

Usage:
    python3 auto_restart.py            # Run checks now
    python3 auto_restart.py --daemon   # Run continuously (every 60s)
    python3 auto_restart.py --status   # Show restart history
"""
import os
import sys
import json
import time
import subprocess
from datetime import datetime, timezone
from pathlib import Path

# === CONFIG ===
# State directory shared by the fleet-health tooling (log + cooldown file).
LOG_DIR = Path(os.path.expanduser("~/.local/timmy/fleet-health"))
# Append-only, human-readable history of every restart attempt.
RESTART_LOG = LOG_DIR / "restarts.log"
# JSON map: "machine/service" -> {"count": consecutive failures, "last": epoch secs}.
COOLDOWN_FILE = LOG_DIR / "restart_cooldowns.json"
# Stop retrying (and possibly escalate) after this many consecutive failures.
MAX_RETRIES = 3
COOLDOWN_PERIOD = 3600  # 1 hour between escalation alerts
# Services definition: name, check command, restart command
# Local services:
# Each entry: "check" (shell command, exit 0 == healthy), "restart" (shell
# command to bring the service back), "critical" (escalate via Telegram
# after MAX_RETRIES failures).
LOCAL_SERVICES = {
    "hermes-gateway": {
        # Healthy when a process matching the command line exists.
        "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
        # Prefer the repo restart script; fall back to launchd (macOS).
        "restart": "cd ~/code-claw && ./restart-gateway.sh 2>/dev/null || launchctl kickstart -k ai.hermes.gateway 2>/dev/null",
        "critical": True,
    },
    "ollama": {
        "check": "pgrep -f 'ollama serve' > /dev/null 2>/dev/null",
        # Try launchd first, then Homebrew services.
        "restart": "launchctl kickstart -k com.ollama.ollama 2>/dev/null || /opt/homebrew/bin/brew services restart ollama 2>/dev/null",
        "critical": False,
    },
    "codeclaw-heartbeat": {
        # Launchd job presence is the health signal (not a pgrep match).
        "check": "launchctl list | grep 'ai.timmy.codeclaw-qwen-heartbeat' > /dev/null 2>/dev/null",
        "restart": "launchctl kickstart -k ai.timmy.codeclaw-qwen-heartbeat 2>/dev/null",
        "critical": False,
    },
}
# VPS services to restart via SSH
# Each host: "ip", "user", and a "services" map shaped like LOCAL_SERVICES;
# both check and restart commands are executed on the remote host over SSH.
VPS_SERVICES = {
    "ezra": {
        "ip": "143.198.27.163",
        "user": "root",
        "services": {
            "gitea": {
                "check": "systemctl is-active gitea 2>/dev/null | grep -q active",
                "restart": "systemctl restart gitea 2>/dev/null",
                "critical": True,
            },
            "nginx": {
                "check": "systemctl is-active nginx 2>/dev/null | grep -q active",
                "restart": "systemctl restart nginx 2>/dev/null",
                "critical": False,
            },
            "hermes-agent": {
                "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
                "restart": "cd /root/wizards/ezra/hermes-agent && source .venv/bin/activate && nohup hermes gateway run --replace > /dev/null 2>&1 &",
                "critical": True,
            },
        },
    },
    "allegro": {
        "ip": "167.99.126.228",
        "user": "root",
        "services": {
            "hermes-agent": {
                "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
                "restart": "cd /root/wizards/allegro/hermes-agent && source .venv/bin/activate && nohup hermes gateway run --replace > /dev/null 2>&1 &",
                "critical": True,
            },
        },
    },
    "bezalel": {
        "ip": "159.203.146.185",
        "user": "root",
        "services": {
            "hermes-agent": {
                "check": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
                # FIX: the original command was "cd /root/wizards/bezalel/hermes/venv/bin/activate && ..."
                # — it cd'ed into the activate *script path* (always fails) and never
                # sourced the venv, so `hermes` was not on PATH. Split into cd + source
                # like the ezra/allegro entries.
                # NOTE(review): confirm the install layout on bezalel really is
                # /root/wizards/bezalel/hermes with a plain "venv" directory.
                "restart": "cd /root/wizards/bezalel/hermes && source venv/bin/activate && nohup hermes gateway run > /dev/null 2>&1 &",
                "critical": True,
            },
            "evennia": {
                "check": "pgrep -f 'evennia' > /dev/null 2>/dev/null",
                "restart": "cd /root/.evennia/timmy_world && evennia restart 2>/dev/null",
                "critical": False,
            },
        },
    },
}
# Bot token is read from disk at send time; alerts are skipped if absent.
TELEGRAM_TOKEN_FILE = Path(os.path.expanduser("~/.config/telegram/special_bot"))
# Destination chat id (negative -> group/supergroup chat).
TELEGRAM_CHAT = "-1003664764329"
def send_telegram(message):
    """Send a best-effort Telegram alert; return True on success.

    Returns False (never raises) when the bot token file is missing or the
    HTTP call fails — alerting must not be able to crash the restart loop.
    """
    if not TELEGRAM_TOKEN_FILE.exists():
        return False
    token = TELEGRAM_TOKEN_FILE.read_text().strip()
    url = f"https://api.telegram.org/bot{token}/sendMessage"
    body = json.dumps({
        "chat_id": TELEGRAM_CHAT,
        "text": f"[AUTO-RESTART]\n{message}",
    }).encode()
    try:
        import urllib.request
        req = urllib.request.Request(url, data=body, headers={"Content-Type": "application/json"}, method="POST")
        # FIX: close the HTTP response explicitly — the original leaked the
        # socket by discarding the urlopen() return value.
        with urllib.request.urlopen(req, timeout=10):
            pass
        return True
    except Exception:
        # Broad catch is deliberate: any network/HTTP error means "not sent".
        return False
def get_cooldowns():
    """Load the persisted cooldown map; an absent or corrupt file yields {}."""
    if not COOLDOWN_FILE.exists():
        return {}
    try:
        return json.loads(COOLDOWN_FILE.read_text())
    except json.JSONDecodeError:
        # Corrupt state file: start fresh rather than crash the agent.
        return {}
def save_cooldowns(data):
    """Persist the cooldown map to disk as pretty-printed JSON."""
    payload = json.dumps(data, indent=2)
    COOLDOWN_FILE.write_text(payload)
def check_service(check_cmd, timeout=10):
    """Return True iff the shell command *check_cmd* exits 0 within *timeout* s."""
    try:
        result = subprocess.run(
            check_cmd,
            shell=True,
            capture_output=True,
            timeout=timeout,
        )
    except (subprocess.TimeoutExpired, subprocess.SubprocessError):
        # A hung or unlaunchable check counts as "service down".
        return False
    return result.returncode == 0
def restart_service(restart_cmd, timeout=30):
    """Run *restart_cmd* via the shell; return True iff it exits 0 in *timeout* s.

    Timeouts and launch failures are reported as a failed restart rather than
    raised — callers only branch on success/failure.
    """
    try:
        proc = subprocess.run(restart_cmd, shell=True, capture_output=True, timeout=timeout)
        return proc.returncode == 0
    # FIX: dropped the unused `as e` binding; TimeoutExpired is a subclass of
    # SubprocessError, so a single catch covers both original cases.
    except subprocess.SubprocessError:
        return False
def try_restart_via_ssh(name, host_config, service_name):
    """Restart *service_name* on the remote host described by *host_config*.

    Builds a one-shot ssh invocation around the service's restart command and
    delegates execution to restart_service(). (*name* is currently unused.)
    """
    service = host_config["services"][service_name]
    target = f'{host_config["user"]}@{host_config["ip"]}'
    restart_cmd = (
        f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 '
        f'{target} "{service["restart"]}"'
    )
    return restart_service(restart_cmd, timeout=30)
def log_restart(service_name, machine, attempt, success):
    """Append one restart attempt to RESTART_LOG and echo it to stdout."""
    status = "SUCCESS" if success else "FAILED"
    stamp = datetime.now(timezone.utc).isoformat()
    entry = f"{stamp} [{status}] {machine}/{service_name} (attempt {attempt})\n"

    RESTART_LOG.parent.mkdir(parents=True, exist_ok=True)
    with open(RESTART_LOG, "a") as fh:
        fh.write(entry)

    print(f" [{status}] {machine}/{service_name} - attempt {attempt}")
def check_and_restart():
    """Run all restart checks."""
    # NOTE(review): `results` is never populated anywhere below — callers get
    # an empty list back; looks like a stub for future reporting.
    results = []
    cooldowns = get_cooldowns()
    now = time.time()

    # Check local services
    for name, service in LOCAL_SERVICES.items():
        if not check_service(service["check"]):
            cooldown_key = f"local/{name}"
            # Consecutive-failure count for this service (0 if unseen).
            retries = cooldowns.get(cooldown_key, {"count": 0, "last": 0}).get("count", 0)

            if retries >= MAX_RETRIES:
                last = cooldowns.get(cooldown_key, {}).get("last", 0)
                # NOTE(review): the alert fires while still *inside* the
                # cooldown window (now - last < COOLDOWN_PERIOD) — confirm the
                # intended direction of this comparison.
                if now - last < COOLDOWN_PERIOD and service["critical"]:
                    send_telegram(f"CRITICAL: local/{name} failed {MAX_RETRIES} restart attempts. Needs human intervention.")
                # NOTE(review): resetting the count here means restart attempts
                # resume on the very next cycle — there is no actual 1-hour
                # pause despite COOLDOWN_PERIOD's name.
                cooldowns[cooldown_key] = {"count": 0, "last": now}
                save_cooldowns(cooldowns)
                continue

            success = restart_service(service["restart"])
            log_restart(name, "local", retries + 1, success)

            # Success clears the failure counter; failure increments it.
            cooldowns[cooldown_key] = {"count": retries + 1 if not success else 0, "last": now}
            save_cooldowns(cooldowns)
            if success:
                # Verify it actually started
                time.sleep(3)
                if check_service(service["check"]):
                    print(f" VERIFIED: local/{name} is running")
                else:
                    print(f" WARNING: local/{name} restart command returned success but process not detected")

    # Check VPS services
    for host, host_config in VPS_SERVICES.items():
        for service_name, service in host_config["services"].items():
            # Remote health check: run the service's check command over ssh.
            check_cmd = f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 {host_config["user"]}@{host_config["ip"]} "{service["check"]}"'
            if not check_service(check_cmd):
                cooldown_key = f"{host}/{service_name}"
                retries = cooldowns.get(cooldown_key, {"count": 0, "last": 0}).get("count", 0)

                if retries >= MAX_RETRIES:
                    last = cooldowns.get(cooldown_key, {}).get("last", 0)
                    # Same escalation logic as the local branch (see notes above).
                    if now - last < COOLDOWN_PERIOD and service["critical"]:
                        send_telegram(f"CRITICAL: {host}/{service_name} failed {MAX_RETRIES} restart attempts. Needs human intervention.")
                    cooldowns[cooldown_key] = {"count": 0, "last": now}
                    save_cooldowns(cooldowns)
                    continue

                success = try_restart_via_ssh(host, host_config, service_name)
                log_restart(service_name, host, retries + 1, success)

                cooldowns[cooldown_key] = {"count": retries + 1 if not success else 0, "last": now}
                save_cooldowns(cooldowns)
                # NOTE(review): unlike the local branch, remote restarts are
                # never re-verified after success.

    return results
def daemon_mode():
    """Run continuously every 60 seconds."""
    remote_total = sum(len(h['services']) for h in VPS_SERVICES.values())
    print("Auto-restart agent running in daemon mode (60s interval)")
    print(f"Monitoring {len(LOCAL_SERVICES)} local + {remote_total} remote services")
    print(f"Max retries per cycle: {MAX_RETRIES}")
    print(f"Cooldown after max retries: {COOLDOWN_PERIOD}s")
    # Loop forever; a supervisor (launchd/systemd) handles process lifetime.
    while True:
        check_and_restart()
        time.sleep(60)
def show_status():
    """Show restart history and cooldowns."""
    print("=== Restart Cooldowns ===")
    for key, data in sorted(get_cooldowns().items()):
        count = data.get("count", 0)
        if not count:
            continue  # only show services with outstanding failures
        when = datetime.fromtimestamp(data.get('last', 0), tz=timezone.utc).strftime('%H:%M')
        print(f" {key}: {count} failures, last at {when}")

    print("\n=== Restart Log (last 20) ===")
    if not RESTART_LOG.exists():
        print(" No restarts logged yet.")
        return
    for line in RESTART_LOG.read_text().strip().split("\n")[-20:]:
        print(f" {line}")
if __name__ == "__main__":
    # Ensure the state directory exists before any mode runs.
    LOG_DIR.mkdir(parents=True, exist_ok=True)

    args = sys.argv[1:]
    if args and args[0] == "--daemon":
        daemon_mode()
    elif args and args[0] == "--status":
        show_status()
    else:
        # Default: one immediate check-and-restart pass.
        check_and_restart()