5-minute health monitoring for all 4 machines + Gitea: - SSH connectivity check (socket-based, instant) - Service check via SSH (nginx, gitea, hermes-agent, evennia) - Disk usage check on all machines - Local process check (hermes, ollama, openclaw, evennia) - Telegram alert with 1-hour cooldown per alert - Running uptime stats saved to ~/.local/timmy/fleet-health/uptime.json - Per-day log files Fixes timmy-home#555, FLEET-006
300 lines
10 KiB
Python
Executable File
300 lines
10 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Fleet Health Check -- The Timmy Foundation
|
|
Runs every 5 minutes via cron. Checks all machines, logs results,
|
|
alerts via Telegram if something is down.
|
|
|
|
Produces:
|
|
- ~/.local/timmy/fleet-health/YYYY-MM-DD.log (per-day log)
|
|
- ~/.local/timmy/fleet-health/uptime.json (running uptime stats)
|
|
- Telegram alert if Gitea or a host's SSH port is down
|
|
|
|
Usage:
|
|
- python3 fleet_health.py # Run checks now
|
|
- python3 fleet_health.py --init # Initialize log directory
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import time
|
|
import socket
|
|
import subprocess
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# === CONFIG ===
|
|
# Probe templates shared by the host table below. Each probe is a shell
# command whose output is recorded as the service's status.
_UNIT_PROBE = "systemctl is-active {}"
_PROC_PROBE = "pgrep -f {} > /dev/null 2>/dev/null && echo active || echo inactive"


def _vps(ip, checks, services):
    """Build one HOSTS entry; every machine is reached as root."""
    # "checks" is kept for reference; no function visible in this file reads it.
    return {"ip": ip, "ssh_user": "root", "checks": checks, "services": services}


# Remote machines to probe: name -> address, ssh account and service probes.
HOSTS = {
    "ezra": _vps(
        "143.198.27.163",
        ["ssh", "gitea"],
        {unit: _UNIT_PROBE.format(unit) for unit in ("nginx", "gitea", "docker")},
    ),
    "allegro": _vps(
        "167.99.126.228",
        ["ssh", "processes"],
        # NOTE(review): this probe discards only stdout, unlike _PROC_PROBE,
        # which also discards stderr. Kept exactly as it was.
        {"hermes-agent": "pgrep -f hermes > /dev/null && echo active || echo inactive"},
    ),
    "bezalel": _vps(
        "159.203.146.185",
        ["ssh", "evennia"],
        {
            "hermes-agent": _PROC_PROBE.format("hermes"),
            "evennia": _PROC_PROBE.format("evennia"),
        },
    ),
}
|
|
|
|
# Processes expected on the machine running this script: label -> shell
# command that exits 0 when a matching process exists.
#
# pgrep takes an *extended* regular expression, where alternation is a bare
# `|` (a backslash-escaped pipe only matches a literal "|" character), so the
# hermes-agent pattern uses a plain group.
#
# Each pattern starts with a one-character bracket expression ('[h]ermes').
# `pgrep -f` matches full command lines, and on shells that keep the
# `sh -c "<command>"` wrapper alive the wrapper's own command line contains
# the pattern text. '[h]ermes' still matches "hermes" but not the literal
# text "[h]ermes", so a check cannot report "active" by matching itself.
LOCAL_CHECKS = {
    "hermes-gateway": "pgrep -f '[h]ermes gateway' > /dev/null 2>/dev/null",
    "hermes-agent": "pgrep -f '[h]ermes (agent|session)' > /dev/null 2>/dev/null",
    "ollama": "pgrep -f '[o]llama serve' > /dev/null 2>/dev/null",
    "openclaw": "pgrep -f '[o]penclaw' > /dev/null 2>/dev/null",
    "evennia": "pgrep -f '[e]vennia' > /dev/null 2>/dev/null",
}
|
|
|
|
# Minimum number of seconds between two alerts that share the same key.
ALERT_COOLDOWN = 60 * 60

# Telegram delivery: destination chat id and the file holding the bot token.
TELEGRAM_CHAT = "-1003664764329"
TELEGRAM_TOKEN_FILE = Path(os.path.expanduser("~/.config/telegram/special_bot"))

# Per-user directory holding the day logs and the two JSON state files.
LOG_DIR = Path(os.path.expanduser("~/.local/timmy/fleet-health"))
UPTIME_FILE = LOG_DIR.joinpath("uptime.json")
LAST_ALERT_FILE = LOG_DIR.joinpath("last_alert.json")
|
|
|
|
|
|
def setup():
    """Create the log directory and seed the JSON state files.

    The directory is created together with any missing parents. The uptime
    and last-alert files are written as an empty JSON object only when they
    do not exist yet; existing files are left untouched.
    """
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    empty_state = json.dumps({})
    for state_file in (UPTIME_FILE, LAST_ALERT_FILE):
        if state_file.exists():
            continue
        state_file.write_text(empty_state)
|
|
|
|
|
|
def check_ssh(host, ip, user="root", timeout=5, port=22):
    """Probe whether a TCP connection to the host's SSH port can be opened.

    Only a socket connect is attempted; no SSH handshake or login happens.

    Args:
        host: Host label. Accepted for call-site symmetry; not used here.
        ip: Address to connect to.
        user: Unused; kept so existing callers keep working.
        timeout: Socket timeout in seconds.
        port: TCP port to probe (22 unless SSH listens elsewhere).

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is True only when the connection
        succeeded; ``message`` is a short human-readable description.
    """
    try:
        # The context manager closes the socket even when connect_ex raises
        # (e.g. for an unusable address) instead of returning an error code.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            result = sock.connect_ex((ip, port))
    except Exception as e:
        # A probe failure must be reported, never raised into the cron run.
        return False, f"SSH check failed: {e}"
    reachable = result == 0
    return reachable, f"SSH port {port} {'open' if reachable else 'closed'}"
|
|
|
|
|
|
def check_remote_services(host_config, timeout=15):
    """Run a host's service probes over one ssh session.

    Args:
        host_config: Mapping with ``ip``, ``ssh_user`` and ``services``
            (label -> shell command whose output is the status).
        timeout: ssh ``ConnectTimeout`` in seconds; the whole call is cut
            off 5 seconds later.

    Returns:
        ``{label: status}`` with each status stripped and lower-cased, ``{}``
        when the host defines no services, or ``{"error": message}`` when
        ssh failed or timed out.
    """
    import shlex  # function-scope import, like urllib.request elsewhere in this file

    ip = host_config["ip"]
    user = host_config["ssh_user"]
    services = host_config.get("services") or {}
    if not services:
        # Nothing to probe; do not open an ssh session for an empty script.
        return {}

    # One line per service: print "<label>: <probe output>".
    # The $(...) must be expanded by the REMOTE shell. Embedding the script in
    # a double-quoted local shell string makes the local shell run the probes
    # instead. The script is therefore sent on ssh's stdin and ssh is started
    # from an argument list, so no local shell ever sees it.
    # Sending it on stdin also keeps the probe text off the remote command
    # line, so a `pgrep -f <pattern>` probe cannot match the shell running it.
    # Each probe reads from /dev/null so it cannot swallow the rest of the
    # script from stdin.
    script = "".join(
        f"printf '%s: %s\\n' {shlex.quote(name)} \"$( {{ {cmd}; }} < /dev/null )\"\n"
        for name, cmd in services.items()
    )
    ssh_cmd = [
        "ssh",
        # Never wait for a password prompt in an unattended run.
        "-o", "BatchMode=yes",
        # NOTE(review): host keys are not verified (kept from the original).
        "-o", "StrictHostKeyChecking=no",
        "-o", f"ConnectTimeout={timeout}",
        f"{user}@{ip}",
        "sh -s",
    ]

    results = {}
    try:
        proc = subprocess.run(
            ssh_cmd,
            input=script,
            capture_output=True,
            text=True,
            timeout=timeout + 5,
        )
    except subprocess.TimeoutExpired:
        return {"error": f"SSH timeout after {timeout}s"}
    except Exception as e:
        # e.g. the ssh binary is missing; report instead of aborting the run.
        return {"error": str(e)}

    if proc.returncode != 0:
        return {"error": f"SSH command failed: {proc.stderr.strip()[:200]}"}

    for line in proc.stdout.strip().split("\n"):
        label, sep, status = line.partition(":")
        if sep:
            results[label.strip()] = status.strip().lower()
    return results
|
|
|
|
|
|
def check_local_processes():
    """Run every LOCAL_CHECKS command and classify the outcome.

    Returns:
        ``{label: state}`` where state is ``"active"`` for exit status 0,
        ``"inactive"`` for any other exit status, or ``"error: <detail>"``
        when the command could not be run or exceeded its 5 second limit.
    """

    def probe(command):
        # Exit status alone decides the state; captured output is discarded.
        try:
            finished = subprocess.run(command, shell=True, capture_output=True, timeout=5)
        except Exception as e:
            return f"error: {e}"
        if finished.returncode == 0:
            return "active"
        return "inactive"

    return {label: probe(command) for label, command in LOCAL_CHECKS.items()}
|
|
|
|
|
|
def check_disk_usage(ip=None, user="root"):
    """Report usage of the root filesystem, locally or on a remote host.

    Args:
        ip: Remote address to query over ssh; when falsy, the local machine
            is inspected instead.
        user: ssh account used for the remote query.

    Returns:
        ``{"total", "used", "available", "percent"}`` taken from columns 2-5
        of the last ``df -h /`` output line, or ``{"error": message}`` when
        the command failed, timed out or printed something unparseable.
    """
    if ip:
        connect_timeout = 10
        # An argument list keeps ip/user away from a local shell; only the
        # final argument is interpreted, and that happens on the remote host.
        cmd = [
            "ssh",
            # Never wait for a password prompt in an unattended run.
            "-o", "BatchMode=yes",
            # NOTE(review): host keys are not verified (kept from the original).
            "-o", "StrictHostKeyChecking=no",
            "-o", f"ConnectTimeout={connect_timeout}",
            f"{user}@{ip}",
            "df -h / | tail -1",
        ]
        # Leave ssh time to hit its own ConnectTimeout and report a useful
        # error before the process is killed from here.
        run_timeout = connect_timeout + 5
    else:
        cmd = ["df", "-h", "/"]
        run_timeout = 10

    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=run_timeout)
    except subprocess.TimeoutExpired:
        return {"error": f"timeout after {run_timeout}s"}
    except Exception as e:
        # e.g. df or ssh is not installed; report instead of raising.
        return {"error": str(e)}

    output = proc.stdout.strip()
    if proc.returncode == 0 and output:
        # The last line is the data row for "/"; the header comes before it.
        last_line = output.splitlines()[-1]
        parts = last_line.split()
        if len(parts) >= 5:
            return {"total": parts[1], "used": parts[2], "available": parts[3], "percent": parts[4]}
        return {"error": f"parse failed: {last_line[:100]}"}
    return {"error": proc.stderr.strip()[:100] if proc.stderr else "empty response"}
|
|
|
|
|
|
def check_gitea(url="https://forge.alexanderwhitestone.com/api/v1/version", timeout=10):
    """Check that the Gitea API answers with a JSON document.

    Args:
        url: Endpoint to fetch; defaults to the forge's version endpoint.
        timeout: Request timeout in seconds.

    Returns:
        ``(True, "Gitea responding: <json, at most 100 chars>")`` when the
        request succeeded and the body parsed as JSON, otherwise
        ``(False, "Gitea check failed: <error>")``.
    """
    import urllib.request

    try:
        req = urllib.request.Request(url)
        # The context manager closes the response on every path.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read())
    except Exception as e:
        # Network, HTTP and JSON errors all mean "not healthy".
        return False, f"Gitea check failed: {e}"
    return True, f"Gitea responding: {json.dumps(data)[:100]}"
|
|
|
|
|
|
def send_alert(message):
    """Send one alert to the configured Telegram chat.

    Args:
        message: Alert text; it is sent below a "[FLEET ALERT]" header line.

    Returns:
        True when Telegram accepted the message, False when reading the
        token or delivering the message failed, and None when no token file
        exists (the alert is then only printed).
    """
    if not TELEGRAM_TOKEN_FILE.exists():
        print(f" [ALERT - NO TELEGRAM TOKEN] {message}")
        return None

    import urllib.request

    try:
        token = TELEGRAM_TOKEN_FILE.read_text().strip()
        url = f"https://api.telegram.org/bot{token}/sendMessage"
        # No parse_mode: alert texts embed raw exception messages, and an
        # unbalanced "_", "*" or "[" in Markdown mode makes Telegram reject
        # the whole message. The alert carries no formatting anyway.
        body = json.dumps({
            "chat_id": TELEGRAM_CHAT,
            "text": f"[FLEET ALERT]\n{message}",
        }).encode()
        req = urllib.request.Request(
            url,
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        # Only success matters; the context manager closes the response.
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        # Delivery is best effort: report the failure, never raise.
        print(f" [ALERT FAILED] {message}: {e}")
        return False
    print(f" [ALERT SENT] {message}")
    return True
|
|
|
|
|
|
def check_alert_cooldown(alert_key):
    """Tell whether an alert with this key may be sent now.

    Returns:
        False when the last-alert file records a send for ``alert_key`` less
        than ALERT_COOLDOWN seconds ago. True otherwise, including when the
        file is missing or does not contain valid JSON.
    """
    if not LAST_ALERT_FILE.exists():
        return True
    try:
        history = json.loads(LAST_ALERT_FILE.read_text())
        # A key that was never recorded counts as sent at the epoch.
        elapsed = time.time() - history.get(alert_key, 0)
    except (json.JSONDecodeError, KeyError):
        # An unreadable history must not block alerts.
        return True
    return not (elapsed < ALERT_COOLDOWN)
|
|
|
|
|
|
def record_alert(alert_key):
    """Store the current time as the last send time for ``alert_key``.

    The existing last-alert file is loaded when present; content that is not
    valid JSON is replaced by a fresh mapping. The updated mapping is then
    written back.
    """
    try:
        history = json.loads(LAST_ALERT_FILE.read_text()) if LAST_ALERT_FILE.exists() else {}
    except json.JSONDecodeError:
        history = {}
    history[alert_key] = time.time()
    LAST_ALERT_FILE.write_text(json.dumps(history))
|
|
|
|
|
|
def run_checks():
    """Run every health probe once, persist the outcome and send alerts.

    Steps, in order: probe Gitea, probe each host in HOSTS (SSH port, disk,
    services), probe local processes and disk, append a section to the
    per-day log, append a record to the uptime file (the newest 1000 are
    kept), send alerts that are not in cooldown, and print a summary.

    Returns:
        The results dict with keys ``timestamp``, ``host``, ``gitea``,
        ``vps``, ``local`` and ``alerts``.
    """
    now = datetime.now(timezone.utc)
    ts = now.strftime("%Y-%m-%d %H:%M:%S UTC")
    day_file = LOG_DIR / f"{now.strftime('%Y-%m-%d')}.log"

    results = {
        "timestamp": ts,
        "host": socket.gethostname(),
        "vps": {},
        "local": {},
        "alerts": [],
    }

    # Check Gitea
    gitea_ok, gitea_msg = check_gitea()
    if not gitea_ok:
        results["gitea"] = {"status": "DOWN", "message": gitea_msg}
        results["alerts"].append(f"Gitea DOWN: {gitea_msg}")
    else:
        results["gitea"] = {"status": "UP", "message": gitea_msg[:100]}

    # Check each VPS
    # NOTE(review): only Gitea and SSH-port failures raise alerts; service,
    # disk and local-process results are recorded but never alerted on.
    for name, config in HOSTS.items():
        vps_result = {"timestamp": ts}
        ssh_ok, ssh_msg = check_ssh(name, config["ip"])
        vps_result["ssh"] = {"ok": ssh_ok, "message": ssh_msg}
        if ssh_ok:
            vps_result["disk"] = check_disk_usage(config["ip"], config["ssh_user"])
            vps_result["services"] = check_remote_services(config)
        else:
            results["alerts"].append(f"{name.upper()} ({config['ip']}) SSH DOWN: {ssh_msg}")
            # The port probe just failed, so an ssh-based disk check could
            # only burn its full timeout; record why there is no reading.
            vps_result["disk"] = {"error": "skipped: SSH unreachable"}
        results["vps"][name] = vps_result

    # Check local processes
    results["local"]["processes"] = check_local_processes()
    results["local"]["disk"] = check_disk_usage()

    # Log results
    day_file.parent.mkdir(parents=True, exist_ok=True)
    with open(day_file, "a") as f:
        f.write(f"\n--- {ts} ---\n")
        for name, vps in results["vps"].items():
            status = "UP" if vps["ssh"]["ok"] else "DOWN"
            f.write(f" {name}: {status}\n")
            if "services" in vps:
                for svc, svc_status in vps["services"].items():
                    f.write(f" {svc}: {svc_status}\n")
        for proc, status in results["local"]["processes"].items():
            f.write(f" local/{proc}: {status}\n")

    # Update uptime stats
    uptime = {}
    if UPTIME_FILE.exists():
        try:
            uptime = json.loads(UPTIME_FILE.read_text())
        except json.JSONDecodeError:
            # A corrupt stats file is replaced rather than aborting the run.
            pass
    if "checks" not in uptime:
        uptime["checks"] = []
    uptime["checks"].append({
        "ts": ts,
        "vps": {name: vps["ssh"]["ok"] for name, vps in results["vps"].items()},
        "gitea": results.get("gitea", {}).get("status") == "UP",
        "local": {k: v == "active" for k, v in results["local"]["processes"].items()},
    })
    if len(uptime["checks"]) > 1000:
        uptime["checks"] = uptime["checks"][-1000:]
    UPTIME_FILE.write_text(json.dumps(uptime, indent=2))

    # Send alerts
    for alert in results["alerts"]:
        alert_key = alert[:80]
        if check_alert_cooldown(alert_key):
            # Start the cooldown only when the alert was not lost. send_alert
            # returns False on a failed delivery, and such an alert must be
            # retried on the next run instead of being muted for an hour.
            if send_alert(alert) is not False:
                record_alert(alert_key)

    # Summary
    up_vps = sum(1 for v in results["vps"].values() if v["ssh"]["ok"])
    total_vps = len(results["vps"])
    up_local = sum(1 for v in results["local"]["processes"].values() if v == "active")
    total_local = len(results["local"]["processes"])
    alert_count = len(results["alerts"])

    print(f"\n=== Fleet Health Check {ts} ===")
    print(f" VPS: {up_vps}/{total_vps} online")
    print(f" Local: {up_local}/{total_local} active")
    print(f" Gitea: {'UP' if results.get('gitea', {}).get('status') == 'UP' else 'DOWN'}")
    if alert_count > 0:
        print(f" ALERTS: {alert_count}")
        for a in results["alerts"]:
            print(f" - {a}")
    else:
        print(" All clear.")

    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # The log directory and state files are prepared on every invocation.
    setup()
    # `--init` is documented in the module usage text as "initialize log
    # directory": stop after setup so it probes nothing and sends no alerts.
    if "--init" not in sys.argv[1:]:
        run_checks()
|