#!/usr/bin/env python3
"""
Fleet Health Check -- The Timmy Foundation

Runs every 5 minutes via cron. Checks all machines, logs results, and
alerts via Telegram if Gitea or a VPS SSH port is down.

Produces:
- ~/.local/timmy/fleet-health/YYYY-MM-DD.log (per-day log)
- ~/.local/timmy/fleet-health/uptime.json (running uptime stats)
- ~/.local/timmy/fleet-health/last_alert.json (alert cooldown state)
- Telegram alert when Gitea or a VPS SSH port check fails

Usage:
- python3 fleet_health.py          # Run checks now
- python3 fleet_health.py --init   # Initialize log directory only
"""

import os
import sys
import json
import time
import shlex
import socket
import subprocess
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

# === CONFIG ===

# NOTE(review): `pgrep -f <pattern>` matches against full command lines, so it
# can match the shell that carries the pattern in its own arguments -- confirm
# the pgrep-based probes below do not always report "active".
HOSTS = {
    "ezra": {
        "ip": "143.198.27.163",
        "ssh_user": "root",
        "checks": ["ssh", "gitea"],
        "services": {
            "nginx": "systemctl is-active nginx",
            "gitea": "systemctl is-active gitea",
            "docker": "systemctl is-active docker",
        },
    },
    "allegro": {
        "ip": "167.99.126.228",
        "ssh_user": "root",
        "checks": ["ssh", "processes"],
        "services": {
            "hermes-agent": "pgrep -f hermes > /dev/null && echo active || echo inactive",
        },
    },
    "bezalel": {
        "ip": "159.203.146.185",
        "ssh_user": "root",
        "checks": ["ssh", "evennia"],
        "services": {
            "hermes-agent": "pgrep -f hermes > /dev/null 2>/dev/null && echo active || echo inactive",
            "evennia": "pgrep -f evennia > /dev/null 2>/dev/null && echo active || echo inactive",
        },
    },
}

# Local probes: each value is a shell command whose exit status 0 means "active".
LOCAL_CHECKS = {
    "hermes-gateway": "pgrep -f 'hermes gateway' > /dev/null 2>/dev/null",
    "hermes-agent": "pgrep -f 'hermes agent\\|hermes session' > /dev/null 2>/dev/null",
    "ollama": "pgrep -f 'ollama serve' > /dev/null 2>/dev/null",
    "evennia": "pgrep -f 'evennia' > /dev/null 2>/dev/null",
}

LOG_DIR = Path(os.path.expanduser("~/.local/timmy/fleet-health"))
UPTIME_FILE = LOG_DIR / "uptime.json"
TELEGRAM_TOKEN_FILE = Path(os.path.expanduser("~/.config/telegram/special_bot"))
TELEGRAM_CHAT = "-1003664764329"
LAST_ALERT_FILE = LOG_DIR / "last_alert.json"
ALERT_COOLDOWN = 3600  # 1 hour between identical alerts
MAX_UPTIME_SAMPLES = 1000  # uptime.json keeps only the most recent samples


# === PURE HELPERS ===

def ssh_argv(user, ip, remote_command, connect_timeout):
    """Return the argv list that runs *remote_command* on ``user@ip`` via ssh.

    An argv list (instead of a shell string) keeps the local shell from
    expanding ``$(...)`` or quotes that are meant for the remote side.
    """
    return [
        "ssh",
        # NOTE(review): host-key checking is disabled, as in the original
        # command line; this accepts any host key -- confirm that is intended.
        "-o", "StrictHostKeyChecking=no",
        # Never prompt for a password/passphrase: this runs unattended.
        "-o", "BatchMode=yes",
        "-o", f"ConnectTimeout={connect_timeout}",
        f"{user}@{ip}",
        remote_command,
    ]


def build_remote_command(services):
    """Build one remote shell script printing ``name: <probe output>`` per service.

    *services* maps a service name to the shell command probing it. The
    ``$(...)`` substitution is evaluated by the remote shell because the
    script is handed to ssh as a single argument.
    """
    return "; ".join(
        f"printf '%s: %s\\n' {shlex.quote(name)} \"$({cmd})\""
        for name, cmd in services.items()
    )


def parse_service_output(stdout):
    """Parse ``name: status`` lines into ``{name: status_lowercased}``.

    Lines without a colon are ignored.
    """
    statuses = {}
    for line in stdout.strip().split("\n"):
        if ":" in line:
            name, status = line.split(":", 1)
            statuses[name.strip()] = status.strip().lower()
    return statuses


def parse_df_output(stdout):
    """Extract total/used/available/percent from the last line of ``df -h /``.

    Returns a dict with those four keys, or ``{"error": ...}`` when the last
    line has fewer than five whitespace-separated fields.
    """
    lines = stdout.strip().splitlines()
    if not lines:
        return {"error": "empty response"}
    last_line = lines[-1]
    parts = last_line.split()
    if len(parts) >= 5:
        return {
            "total": parts[1],
            "used": parts[2],
            "available": parts[3],
            "percent": parts[4],
        }
    return {"error": f"parse failed: {last_line[:100]}"}


def load_json_dict(path):
    """Read *path* as JSON and return it if it is a dict, else ``{}``.

    Missing, unreadable, corrupt, or non-dict files all yield ``{}`` so state
    files can never crash a health-check run.
    """
    try:
        data = json.loads(Path(path).read_text())
    except (OSError, ValueError):
        return {}
    return data if isinstance(data, dict) else {}


# === CHECKS ===

def setup():
    """Create the log directory and empty state files if they do not exist."""
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    for state_file in (UPTIME_FILE, LAST_ALERT_FILE):
        if not state_file.exists():
            state_file.write_text(json.dumps({}))


def check_ssh(host, ip, user="root", timeout=5):
    """Probe TCP port 22 on *ip*.

    *host* and *user* are accepted for interface compatibility but unused:
    only a TCP connect is attempted, no SSH login.

    Returns ``(ok, message)``.
    """
    try:
        # The context manager closes the socket even if the probe raises.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            result = sock.connect_ex((ip, 22))
    except Exception as e:  # probe boundary: any failure means "down"
        return False, f"SSH check failed: {e}"
    ok = result == 0
    return ok, f"SSH port 22 {'open' if ok else 'closed'}"


def check_remote_services(host_config, timeout=15):
    """Run every probe in ``host_config["services"]`` on the remote host.

    Returns ``{service_name: status}`` on success, or ``{"error": message}``
    when the ssh invocation fails or times out.
    """
    ip = host_config["ip"]
    user = host_config["ssh_user"]
    try:
        services = host_config["services"]
        if not services:
            # An empty remote command would make ssh start a login shell.
            return {}
        proc = subprocess.run(
            ssh_argv(user, ip, build_remote_command(services), timeout),
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            timeout=timeout + 5,  # outer guard, longer than ssh's own timeout
        )
    except subprocess.TimeoutExpired:
        return {"error": f"SSH timeout after {timeout}s"}
    except Exception as e:  # probe boundary: report, never crash the run
        return {"error": str(e)}
    if proc.returncode != 0:
        return {"error": f"SSH command failed: {proc.stderr.strip()[:200]}"}
    return parse_service_output(proc.stdout)


def check_local_processes():
    """Run each LOCAL_CHECKS command; exit status 0 maps to ``"active"``.

    Returns ``{name: "active" | "inactive" | "error: ..."}``.
    """
    results = {}
    for name, cmd in LOCAL_CHECKS.items():
        try:
            # shell=True is required: the configured probes use redirections.
            proc = subprocess.run(cmd, shell=True, capture_output=True, timeout=5)
            results[name] = "active" if proc.returncode == 0 else "inactive"
        except Exception as e:  # probe boundary
            results[name] = f"error: {e}"
    return results


def check_disk_usage(ip=None, user="root"):
    """Report root-filesystem usage locally, or on ``user@ip`` when *ip* is set.

    Returns a dict with ``total``/``used``/``available``/``percent`` strings
    as printed by ``df -h /``, or ``{"error": message}``.
    """
    if ip:
        argv = ssh_argv(user, ip, "df -h /", 10)
        run_timeout = 15  # must exceed ssh ConnectTimeout so ssh can report
    else:
        argv = ["df", "-h", "/"]
        run_timeout = 10
    try:
        proc = subprocess.run(
            argv,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            timeout=run_timeout,
        )
    except Exception as e:  # probe boundary
        return {"error": str(e)}
    if proc.returncode == 0 and proc.stdout.strip():
        return parse_df_output(proc.stdout)
    return {"error": proc.stderr.strip()[:100] or "empty response"}


def check_gitea():
    """Query the Gitea version endpoint.

    Returns ``(ok, message)``; *ok* is True when the endpoint answers with
    parseable JSON.
    """
    try:
        req = urllib.request.Request("https://forge.alexanderwhitestone.com/api/v1/version")
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
    except Exception as e:  # probe boundary: any failure means "down"
        return False, f"Gitea check failed: {e}"
    return True, f"Gitea responding: {json.dumps(data)[:100]}"


# === ALERTING ===

def send_alert(message):
    """Send *message* to the configured Telegram chat.

    Returns True when Telegram accepted the message, False otherwise
    (including when no bot token file exists).
    """
    if not TELEGRAM_TOKEN_FILE.exists():
        print(f" [ALERT - NO TELEGRAM TOKEN] {message}")
        return False
    try:
        token = TELEGRAM_TOKEN_FILE.read_text().strip()
        url = f"https://api.telegram.org/bot{token}/sendMessage"
        # Sent as plain text: alert bodies embed raw exception text, and
        # Markdown parsing rejects unbalanced '_', '*' or '[' characters.
        body = json.dumps({
            "chat_id": TELEGRAM_CHAT,
            "text": f"[FLEET ALERT]\n{message}",
        }).encode()
        req = urllib.request.Request(
            url,
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:  # alerting is best-effort; never crash the run
        print(f" [ALERT FAILED] {message}: {e}")
        return False
    print(f" [ALERT SENT] {message}")
    return True


def check_alert_cooldown(alert_key):
    """Return True when *alert_key* may be sent (no send within ALERT_COOLDOWN)."""
    last = load_json_dict(LAST_ALERT_FILE).get(alert_key, 0)
    try:
        return time.time() - float(last) >= ALERT_COOLDOWN
    except (TypeError, ValueError):
        # Corrupt timestamp: treat as never sent.
        return True


def record_alert(alert_key):
    """Store the current time as the last send time of *alert_key*.

    Entries whose cooldown has already expired are dropped so the state file
    does not grow without bound.
    """
    now = time.time()
    cooldowns = {
        key: stamp
        for key, stamp in load_json_dict(LAST_ALERT_FILE).items()
        if isinstance(stamp, (int, float)) and now - stamp < ALERT_COOLDOWN
    }
    cooldowns[alert_key] = now
    LAST_ALERT_FILE.write_text(json.dumps(cooldowns))


# === ORCHESTRATION ===

def _write_day_log(day_file, ts, results):
    """Append one human-readable section for this run to the per-day log."""
    day_file.parent.mkdir(parents=True, exist_ok=True)
    lines = [f"\n--- {ts} ---\n"]
    for name, vps in results["vps"].items():
        status = "UP" if vps["ssh"]["ok"] else "DOWN"
        lines.append(f" {name}: {status}\n")
        for svc, svc_status in vps.get("services", {}).items():
            lines.append(f" {svc}: {svc_status}\n")
    for proc_name, status in results["local"]["processes"].items():
        lines.append(f" local/{proc_name}: {status}\n")
    with open(day_file, "a") as f:
        f.writelines(lines)


def _update_uptime(ts, results, gitea_ok):
    """Append this run's up/down sample to uptime.json, keeping the newest ones."""
    uptime = load_json_dict(UPTIME_FILE)
    checks = uptime.get("checks")
    if not isinstance(checks, list):
        checks = []
    checks.append({
        "ts": ts,
        "vps": {name: vps["ssh"]["ok"] for name, vps in results["vps"].items()},
        "gitea": gitea_ok,
        "local": {k: v == "active" for k, v in results["local"]["processes"].items()},
    })
    uptime["checks"] = checks[-MAX_UPTIME_SAMPLES:]
    UPTIME_FILE.write_text(json.dumps(uptime, indent=2))


def _print_summary(ts, results, gitea_ok):
    """Print the console summary of this run."""
    vps_states = [v["ssh"]["ok"] for v in results["vps"].values()]
    local_states = [v == "active" for v in results["local"]["processes"].values()]
    alerts = results["alerts"]

    print(f"\n=== Fleet Health Check {ts} ===")
    print(f" VPS: {sum(vps_states)}/{len(vps_states)} online")
    print(f" Local: {sum(local_states)}/{len(local_states)} active")
    print(f" Gitea: {'UP' if gitea_ok else 'DOWN'}")
    if alerts:
        print(f" ALERTS: {len(alerts)}")
        for a in alerts:
            print(f" - {a}")
    else:
        print(" All clear.")


def run_checks():
    """Run every check once, log and persist the results, and send alerts.

    Returns the results dict with keys ``timestamp``, ``host``, ``gitea``,
    ``vps``, ``local`` and ``alerts``.
    """
    now = datetime.now(timezone.utc)
    ts = now.strftime("%Y-%m-%d %H:%M:%S UTC")
    day_file = LOG_DIR / f"{now.strftime('%Y-%m-%d')}.log"

    results = {
        "timestamp": ts,
        "host": socket.gethostname(),
        "vps": {},
        "local": {},
        "alerts": [],
    }

    # Check Gitea
    gitea_ok, gitea_msg = check_gitea()
    if gitea_ok:
        results["gitea"] = {"status": "UP", "message": gitea_msg[:100]}
    else:
        results["gitea"] = {"status": "DOWN", "message": gitea_msg}
        results["alerts"].append(f"Gitea DOWN: {gitea_msg}")

    # Check each VPS
    for name, config in HOSTS.items():
        vps_result = {"timestamp": ts}
        ssh_ok, ssh_msg = check_ssh(name, config["ip"])
        vps_result["ssh"] = {"ok": ssh_ok, "message": ssh_msg}
        if ssh_ok:
            vps_result["disk"] = check_disk_usage(config["ip"], config["ssh_user"])
            vps_result["services"] = check_remote_services(config)
        else:
            results["alerts"].append(f"{name.upper()} ({config['ip']}) SSH DOWN: {ssh_msg}")
            # Port 22 is already known closed: an ssh-based disk probe would
            # only burn its full timeout before failing.
            vps_result["disk"] = {"error": "skipped: SSH unreachable"}
        results["vps"][name] = vps_result

    # Check local processes
    results["local"]["processes"] = check_local_processes()
    results["local"]["disk"] = check_disk_usage()

    _write_day_log(day_file, ts, results)
    _update_uptime(ts, results, gitea_ok)

    # Send alerts. The cooldown is recorded only after a successful send so a
    # failed delivery is retried on the next run instead of being muted.
    for alert in results["alerts"]:
        alert_key = alert[:80]
        if check_alert_cooldown(alert_key) and send_alert(alert):
            record_alert(alert_key)

    _print_summary(ts, results, gitea_ok)
    return results


def main(argv=None):
    """Command-line entry point.

    With ``--init`` only the log directory and state files are created;
    otherwise a full check run follows. Returns the process exit code.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    setup()
    if "--init" in args:
        print(f"Initialized {LOG_DIR}")
        return 0
    run_checks()
    return 0


if __name__ == "__main__":
    sys.exit(main())