249 lines
9.4 KiB
Python
249 lines
9.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Ezra self-check / health monitoring script.
|
|
Checks all wizard infrastructure and reports status.
|
|
|
|
Epic: EZRA-SELF-001 / Phase 4 - Self-Monitoring
|
|
Author: Ezra (self-improvement)
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import socket
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
|
|
class HealthCheck:
    """Run health checks on Ezra's infrastructure.

    Every ``check_*`` static method probes one piece of infrastructure and
    returns an ``(ok, detail)`` tuple.  :meth:`check` wraps such a callable,
    converts the outcome into a result dict and appends it to
    ``self.results``; :meth:`run_all` runs the built-in set and summarises
    them; :meth:`format_report` renders a summary as markdown.
    """

    # Locations and endpoints shared by several checks.  Kept as class
    # constants so each value is spelled exactly once.
    EZRA_ROOT = Path("/root/wizards/ezra")
    EZRA_HOME = EZRA_ROOT / "home"
    GITEA_API = "http://143.198.27.163:3000/api/v1"
    HTTP_TIMEOUT = 5  # seconds, applied to each Gitea request

    def __init__(self):
        # Result dicts in execution order; appended to by check().
        self.results = []
        # Reference point for the "elapsed_seconds" figure in run_all().
        self.start_time = time.time()

    def check(self, name: str, fn, critical: bool = False) -> dict:
        """Run a single health check and record its outcome.

        Args:
            name: Human-readable label stored in the result.
            fn: Zero-argument callable returning ``(ok, detail)``.
            critical: Whether a non-passing outcome counts as a critical
                failure in :meth:`run_all`.

        Returns:
            The result dict (``name``, ``status``, ``detail``, ``critical``)
            that was appended to ``self.results``.  ``status`` is ``"PASS"``
            or ``"FAIL"`` depending on ``ok``, or ``"ERROR"`` when ``fn``
            raised, in which case ``detail`` is the exception text.
        """
        try:
            ok, detail = fn()
            result = {
                "name": name,
                "status": "PASS" if ok else "FAIL",
                "detail": detail,
                "critical": critical,
            }
        except Exception as e:
            # A crashing probe must not abort the remaining checks.
            result = {
                "name": name,
                "status": "ERROR",
                "detail": str(e),
                "critical": critical,
            }
        self.results.append(result)
        return result

    # === Individual checks ===

    @staticmethod
    def check_disk_space() -> tuple[bool, str]:
        """Check free space on ``/`` (fails unless more than 2GB is free)."""
        st = os.statvfs("/")
        free_gb = (st.f_bavail * st.f_frsize) / (1024 ** 3)
        total_gb = (st.f_blocks * st.f_frsize) / (1024 ** 3)
        # Guard against a filesystem that reports zero total blocks, which
        # would otherwise raise ZeroDivisionError.
        pct_used = ((total_gb - free_gb) / total_gb) * 100 if total_gb else 0.0
        ok = free_gb > 2.0
        return ok, f"{free_gb:.1f}GB free / {total_gb:.1f}GB total ({pct_used:.0f}% used)"

    @staticmethod
    def check_hermes_gateway() -> tuple[bool, str]:
        """Check that the process named in ``gateway.pid`` is alive."""
        pid_file = HealthCheck.EZRA_HOME / "gateway.pid"
        if not pid_file.exists():
            return False, "No gateway.pid found"

        not_running = (False, "Gateway PID file exists but process not running")
        try:
            pid = int(pid_file.read_text().strip())
        except ValueError:
            return not_running
        if pid <= 0:
            # PIDs <= 0 address process groups rather than one process, so
            # signalling them says nothing about the gateway.
            return not_running

        try:
            os.kill(pid, 0)  # Signal 0: existence/permission probe only
        except ProcessLookupError:
            return not_running
        except PermissionError:
            # EPERM means the process exists but belongs to another user;
            # for a liveness check that still counts as running.
            pass
        return True, f"Gateway running (PID {pid})"

    @staticmethod
    def check_gitea_api() -> tuple[bool, str]:
        """Check that the Gitea API answers the ``/version`` endpoint."""
        import urllib.request

        try:
            req = urllib.request.Request(
                f"{HealthCheck.GITEA_API}/version",
                headers={"Accept": "application/json"},
            )
            # Context manager closes the HTTP response even if parsing fails.
            with urllib.request.urlopen(req, timeout=HealthCheck.HTTP_TIMEOUT) as resp:
                data = json.loads(resp.read())
            return True, f"Gitea {data.get('version', 'unknown')}"
        except Exception as e:
            return False, f"Gitea unreachable: {e}"

    @staticmethod
    def check_gitea_token() -> tuple[bool, str]:
        """Check that a Gitea token is available and accepted by ``/user``.

        The token is read from the ``GITEA_TOKEN`` environment variable,
        falling back to a ``GITEA_TOKEN=`` line in the ``.env`` file under
        the Ezra home directory.
        """
        token = os.getenv("GITEA_TOKEN", "")
        if not token:
            # Try loading from env file
            env_file = HealthCheck.EZRA_HOME / ".env"
            if env_file.exists():
                for line in env_file.read_text().splitlines():
                    if line.startswith("GITEA_TOKEN="):
                        token = line.split("=", 1)[1].strip().strip('"').strip("'")
                        break
        if not token:
            return False, "No GITEA_TOKEN found"

        try:
            import urllib.request

            req = urllib.request.Request(
                f"{HealthCheck.GITEA_API}/user",
                headers={"Authorization": f"token {token}", "Accept": "application/json"},
            )
            # Context manager closes the HTTP response even if parsing fails.
            with urllib.request.urlopen(req, timeout=HealthCheck.HTTP_TIMEOUT) as resp:
                data = json.loads(resp.read())
            return True, f"Authenticated as {data.get('login', 'unknown')}"
        except Exception as e:
            return False, f"Token invalid: {e}"

    @staticmethod
    def check_llama_server(port: int = 11435) -> tuple[bool, str]:
        """Check that something accepts TCP connections on 127.0.0.1:``port``."""
        try:
            # ``with`` guarantees the socket is closed when connect() fails,
            # which a trailing close() call would not.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(3)
                s.connect(("127.0.0.1", port))
            return True, f"llama-server listening on :{port}"
        except Exception:
            return False, f"llama-server not responding on :{port}"

    @staticmethod
    def check_memory_file() -> tuple[bool, str]:
        """Check that MEMORY.md exists and report its line count and size.

        Note: an existing but empty file still passes; only a missing file
        fails.
        """
        mem = HealthCheck.EZRA_HOME / "memories" / "MEMORY.md"
        if not mem.exists():
            return False, "MEMORY.md not found"
        size = mem.stat().st_size
        lines = len(mem.read_text().splitlines())
        return True, f"MEMORY.md: {lines} lines, {size} bytes"

    @staticmethod
    def check_skills_count() -> tuple[bool, str]:
        """Count installed skills (one per ``SKILL.md`` found recursively)."""
        skills_dir = HealthCheck.EZRA_HOME / "skills"
        if not skills_dir.exists():
            return False, "Skills directory not found"
        count = sum(1 for _ in skills_dir.rglob("SKILL.md"))
        return count > 0, f"{count} skills installed"

    @staticmethod
    def check_cron_jobs() -> tuple[bool, str]:
        """Check that cron ``jobs.json`` is readable and count active jobs."""
        cron_file = HealthCheck.EZRA_HOME / "cron" / "jobs.json"
        if not cron_file.exists():
            return False, "No cron jobs.json found"
        try:
            jobs = json.loads(cron_file.read_text())
            active = sum(1 for j in jobs if j.get("status") == "active")
            total = len(jobs)
            return True, f"{active} active / {total} total cron jobs"
        except Exception as e:
            # Covers malformed JSON as well as entries that are not dicts.
            return False, f"Error reading jobs.json: {e}"

    @staticmethod
    def check_sessions_db() -> tuple[bool, str]:
        """Check that ``state.db`` exists and report its size."""
        db_path = HealthCheck.EZRA_HOME / "state.db"
        if not db_path.exists():
            return False, "state.db not found"
        size_mb = db_path.stat().st_size / (1024 * 1024)
        return True, f"state.db: {size_mb:.1f}MB"

    @staticmethod
    def check_backups() -> tuple[bool, str]:
        """Check backup freshness (fails if the newest is 48h old or more).

        ``*.tar.gz`` archives are preferred; when none exist, any entry in
        the backups directory is considered.
        """
        backup_dir = HealthCheck.EZRA_ROOT / "backups"
        if not backup_dir.exists():
            return False, "No backups directory"
        backups = sorted(backup_dir.glob("*.tar.gz"), key=lambda p: p.stat().st_mtime, reverse=True)
        if not backups:
            backups = sorted(backup_dir.glob("*"), key=lambda p: p.stat().st_mtime, reverse=True)
        if not backups:
            return False, "No backups found"
        latest = backups[0]
        age_hours = (time.time() - latest.stat().st_mtime) / 3600
        return age_hours < 48, f"Latest: {latest.name} ({age_hours:.0f}h ago)"

    # === Runner ===

    def run_all(self) -> dict:
        """Run all built-in health checks and return a summary dict.

        Results are appended to ``self.results`` and never cleared, so the
        summary covers every check recorded on this instance so far
        (including earlier ``check``/``run_all`` calls).  ``elapsed_seconds``
        is measured from instance construction.
        """
        # (label, probe, critical) in execution order.
        plan = (
            ("Disk Space", self.check_disk_space, True),
            ("Hermes Gateway", self.check_hermes_gateway, True),
            ("Gitea API", self.check_gitea_api, True),
            ("Gitea Token", self.check_gitea_token, True),
            ("llama-server", self.check_llama_server, False),
            ("Memory File", self.check_memory_file, False),
            ("Skills", self.check_skills_count, False),
            ("Cron Jobs", self.check_cron_jobs, False),
            ("Sessions DB", self.check_sessions_db, False),
            ("Backups", self.check_backups, False),
        )
        for label, probe, is_critical in plan:
            self.check(label, probe, critical=is_critical)

        elapsed = time.time() - self.start_time
        passed = sum(1 for r in self.results if r["status"] == "PASS")
        failed = sum(1 for r in self.results if r["status"] in ("FAIL", "ERROR"))
        crit_fail = sum(1 for r in self.results if r["status"] in ("FAIL", "ERROR") and r["critical"])

        return {
            "timestamp": datetime.now().isoformat(),
            "elapsed_seconds": round(elapsed, 2),
            "total": len(self.results),
            "passed": passed,
            "failed": failed,
            "critical_failures": crit_fail,
            "healthy": crit_fail == 0,
            "checks": self.results,
        }

    @staticmethod
    def _table_cell(value) -> str:
        """Make *value* safe to embed in a single markdown table cell."""
        # Line breaks end the table row and a bare "|" starts a new column,
        # so flatten the former and backslash-escape the latter.
        flat = " ".join(str(value).splitlines())
        return flat.replace("|", "\\|")

    def format_report(self, result: dict = None) -> str:
        """Format health check results as markdown.

        Args:
            result: Summary dict as produced by :meth:`run_all`.  When
                omitted, :meth:`run_all` is invoked to produce one.

        Returns:
            A markdown document: title, status line, a table with one row
            per check, and a "Critical Failures" list when any exist.
        """
        if result is None:
            result = self.run_all()

        lines = [
            f"# Ezra Health Check - {result['timestamp'][:19]}",
            "",
            f"**Status: {'HEALTHY' if result['healthy'] else 'UNHEALTHY'}** | "
            f"{result['passed']}/{result['total']} passed | "
            f"{result['elapsed_seconds']}s",
            "",
            "| Check | Status | Detail |",
            "|-------|--------|--------|",
        ]
        for c in result["checks"]:
            icon = {"PASS": "✅", "FAIL": "❌", "ERROR": "⚠️"}.get(c["status"], "?")
            crit = " 🔴" if c["critical"] and c["status"] != "PASS" else ""
            # Details are free text (often exception messages); escape them
            # so they cannot break the table layout.
            detail = self._table_cell(c["detail"])
            lines.append(f"| {c['name']} | {icon} {c['status']}{crit} | {detail} |")

        if result["critical_failures"] > 0:
            lines.extend(["", "## Critical Failures"])
            for c in result["checks"]:
                if c["critical"] and c["status"] != "PASS":
                    lines.append(f"- **{c['name']}**: {c['detail']}")

        return "\n".join(lines)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run every check once, then print the markdown report.
    checker = HealthCheck()
    print(checker.format_report(checker.run_all()))
|