Files
ezra-environment/tools/health_check.py

249 lines
9.4 KiB
Python

#!/usr/bin/env python3
"""
Ezra self-check / health monitoring script.
Checks all wizard infrastructure and reports status.
Epic: EZRA-SELF-001 / Phase 4 - Self-Monitoring
Author: Ezra (self-improvement)
"""
import json
import os
import subprocess
import socket
import time
from datetime import datetime
from pathlib import Path
class HealthCheck:
    """Run health checks on Ezra's infrastructure.

    Each ``check_*`` static method is a self-contained probe that returns
    ``(ok, detail)``: a boolean verdict plus a human-readable detail string.
    ``check`` wraps a probe, converts any raised exception into an ``ERROR``
    result, and records the outcome in ``self.results``. ``run_all`` executes
    the standard probe set and ``format_report`` renders a markdown summary.

    The probes take their targets as defaulted parameters so they can be
    pointed at other locations; called without arguments they inspect the
    same paths and endpoints as before.
    """

    # Default locations / endpoints inspected by the probes.
    EZRA_HOME = Path("/root/wizards/ezra/home")
    BACKUP_DIR = Path("/root/wizards/ezra/backups")
    GITEA_URL = "http://143.198.27.163:3000"

    def __init__(self):
        # Result dicts appended by check(), in execution order.
        self.results = []
        # Wall-clock reference for elapsed time; refreshed by run_all().
        self.start_time = time.time()

    def check(self, name: str, fn, critical: bool = False) -> dict:
        """Run a single health check and record its outcome.

        Args:
            name: Label used for this check in the report.
            fn: Zero-argument callable returning ``(ok, detail)``.
            critical: Whether a non-PASS outcome counts as a critical failure.

        Returns:
            The result dict (``name``, ``status``, ``detail``, ``critical``)
            that was appended to ``self.results``. ``status`` is ``"PASS"`` or
            ``"FAIL"`` according to ``ok``, or ``"ERROR"`` if ``fn`` raised.
        """
        try:
            ok, detail = fn()
        except Exception as e:
            # Boundary for all probes: a crashing probe must not abort the
            # whole run. Fall back to the exception type when the message is
            # empty so the report never shows a blank detail.
            status, detail = "ERROR", str(e) or type(e).__name__
        else:
            status = "PASS" if ok else "FAIL"
        result = {
            "name": name,
            "status": status,
            "detail": detail,
            "critical": critical,
        }
        self.results.append(result)
        return result

    # === Individual checks ===

    @staticmethod
    def check_disk_space(path: str = "/", min_free_gb: float = 2.0) -> tuple[bool, str]:
        """Check free disk space on the filesystem containing ``path``.

        Passes only when strictly more than ``min_free_gb`` GiB are available
        to unprivileged users (``f_bavail``).
        """
        st = os.statvfs(path)
        free_gb = (st.f_bavail * st.f_frsize) / (1024 ** 3)
        total_gb = (st.f_blocks * st.f_frsize) / (1024 ** 3)
        # Guard against filesystems that report zero blocks.
        pct_used = ((total_gb - free_gb) / total_gb) * 100 if total_gb else 0.0
        ok = free_gb > min_free_gb
        return ok, f"{free_gb:.1f}GB free / {total_gb:.1f}GB total ({pct_used:.0f}% used)"

    @staticmethod
    def check_hermes_gateway(pid_file: Path = EZRA_HOME / "gateway.pid") -> tuple[bool, str]:
        """Check if the Hermes gateway process named in ``pid_file`` is alive."""
        if not pid_file.exists():
            return False, "No gateway.pid found"
        not_running = (False, "Gateway PID file exists but process not running")
        try:
            pid = int(pid_file.read_text().strip())
        except ValueError:
            return not_running
        if pid <= 0:
            # kill(0, ...) / kill(-n, ...) address process groups or every
            # process, so signal 0 would "succeed" without proving anything.
            return not_running
        try:
            os.kill(pid, 0)  # Signal 0: existence probe, delivers nothing.
        except (ProcessLookupError, OverflowError):
            return not_running
        except PermissionError:
            # The process exists but belongs to another user: it is running.
            pass
        return True, f"Gateway running (PID {pid})"

    @staticmethod
    def check_gitea_api(base_url: str = GITEA_URL, timeout: float = 5) -> tuple[bool, str]:
        """Check that the Gitea API answers its ``/api/v1/version`` endpoint."""
        import urllib.request
        try:
            req = urllib.request.Request(
                f"{base_url}/api/v1/version",
                headers={"Accept": "application/json"},
            )
            # Context manager closes the HTTP response even if parsing fails.
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = json.loads(resp.read())
            return True, f"Gitea {data.get('version', 'unknown')}"
        except Exception as e:
            # Any failure (network, HTTP status, bad JSON) means "not healthy".
            return False, f"Gitea unreachable: {e}"

    @staticmethod
    def check_gitea_token(
        env_file: Path = EZRA_HOME / ".env",
        base_url: str = GITEA_URL,
        timeout: float = 5,
    ) -> tuple[bool, str]:
        """Check Gitea token validity by calling ``/api/v1/user``.

        The token is read from the ``GITEA_TOKEN`` environment variable, or,
        when that is unset or empty, from a ``GITEA_TOKEN=`` line in
        ``env_file`` (surrounding quotes are stripped).
        """
        token = os.getenv("GITEA_TOKEN", "")
        if not token and env_file.exists():
            for line in env_file.read_text().splitlines():
                if line.startswith("GITEA_TOKEN="):
                    token = line.split("=", 1)[1].strip().strip('"').strip("'")
                    break
        if not token:
            return False, "No GITEA_TOKEN found"
        try:
            import urllib.request
            req = urllib.request.Request(
                f"{base_url}/api/v1/user",
                headers={"Authorization": f"token {token}", "Accept": "application/json"},
            )
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = json.loads(resp.read())
            return True, f"Authenticated as {data.get('login', 'unknown')}"
        except Exception as e:
            # NOTE(review): network errors are also reported as "Token invalid".
            return False, f"Token invalid: {e}"

    @staticmethod
    def check_llama_server(port: int = 11435) -> tuple[bool, str]:
        """Check if something accepts TCP connections on 127.0.0.1:``port``."""
        try:
            # create_connection closes the socket on failure; the with-block
            # closes it on success.
            with socket.create_connection(("127.0.0.1", port), timeout=3):
                pass
        except Exception:
            # Best-effort probe: any connection problem means "not responding".
            return False, f"llama-server not responding on :{port}"
        return True, f"llama-server listening on :{port}"

    @staticmethod
    def check_memory_file(path: Path = EZRA_HOME / "memories" / "MEMORY.md") -> tuple[bool, str]:
        """Check Ezra's memory file exists and has content (non-zero size)."""
        if not path.exists():
            return False, "MEMORY.md not found"
        size = path.stat().st_size
        # Tolerate stray non-UTF-8 bytes: only the line count is needed here.
        text = path.read_text(encoding="utf-8", errors="replace")
        lines = len(text.splitlines())
        return size > 0, f"MEMORY.md: {lines} lines, {size} bytes"

    @staticmethod
    def check_skills_count(skills_dir: Path = EZRA_HOME / "skills") -> tuple[bool, str]:
        """Count installed skills (one per ``SKILL.md`` found recursively)."""
        if not skills_dir.exists():
            return False, "Skills directory not found"
        count = sum(1 for _ in skills_dir.rglob("SKILL.md"))
        return count > 0, f"{count} skills installed"

    @staticmethod
    def check_cron_jobs(cron_file: Path = EZRA_HOME / "cron" / "jobs.json") -> tuple[bool, str]:
        """Check cron jobs status by counting entries with ``status == "active"``."""
        if not cron_file.exists():
            return False, "No cron jobs.json found"
        try:
            # presumably a JSON list of job objects; any other shape ends up in
            # the except branch below -- verify against the writer of jobs.json
            jobs = json.loads(cron_file.read_text())
            active = sum(1 for j in jobs if j.get("status") == "active")
            total = len(jobs)
            return True, f"{active} active / {total} total cron jobs"
        except Exception as e:
            return False, f"Error reading jobs.json: {e}"

    @staticmethod
    def check_sessions_db(db_path: Path = EZRA_HOME / "state.db") -> tuple[bool, str]:
        """Check the sessions database file exists and report its size."""
        if not db_path.exists():
            return False, "state.db not found"
        size_mb = db_path.stat().st_size / (1024 * 1024)
        return True, f"state.db: {size_mb:.1f}MB"

    @staticmethod
    def check_backups(backup_dir: Path = BACKUP_DIR, max_age_hours: float = 48) -> tuple[bool, str]:
        """Check backup freshness.

        Considers ``*.tar.gz`` entries in ``backup_dir``; if there are none,
        falls back to every entry in the directory. Passes when the most
        recently modified candidate is younger than ``max_age_hours``.
        """
        if not backup_dir.exists():
            return False, "No backups directory"
        candidates = list(backup_dir.glob("*.tar.gz")) or list(backup_dir.glob("*"))
        if not candidates:
            return False, "No backups found"
        latest = max(candidates, key=lambda p: p.stat().st_mtime)
        age_hours = (time.time() - latest.stat().st_mtime) / 3600
        return age_hours < max_age_hours, f"Latest: {latest.name} ({age_hours:.0f}h ago)"

    # === Runner ===

    def run_all(self) -> dict:
        """Run all health checks and return a summary.

        Starts from a clean slate: ``self.results`` and ``self.start_time``
        are reset so repeated calls do not double-count earlier runs.

        Returns:
            Dict with ``timestamp``, ``elapsed_seconds``, ``total``,
            ``passed``, ``failed``, ``critical_failures``, ``healthy`` (true
            when no critical check failed) and ``checks`` (the result list).
        """
        self.results = []
        self.start_time = time.time()

        # (report label, probe, critical?)
        checks = (
            ("Disk Space", self.check_disk_space, True),
            ("Hermes Gateway", self.check_hermes_gateway, True),
            ("Gitea API", self.check_gitea_api, True),
            ("Gitea Token", self.check_gitea_token, True),
            ("llama-server", self.check_llama_server, False),
            ("Memory File", self.check_memory_file, False),
            ("Skills", self.check_skills_count, False),
            ("Cron Jobs", self.check_cron_jobs, False),
            ("Sessions DB", self.check_sessions_db, False),
            ("Backups", self.check_backups, False),
        )
        for name, fn, critical in checks:
            self.check(name, fn, critical=critical)

        elapsed = time.time() - self.start_time
        not_ok = [r for r in self.results if r["status"] in ("FAIL", "ERROR")]
        passed = sum(1 for r in self.results if r["status"] == "PASS")
        crit_fail = sum(1 for r in not_ok if r["critical"])
        return {
            "timestamp": datetime.now().isoformat(),
            "elapsed_seconds": round(elapsed, 2),
            "total": len(self.results),
            "passed": passed,
            "failed": len(not_ok),
            "critical_failures": crit_fail,
            "healthy": crit_fail == 0,
            "checks": self.results,
        }

    def format_report(self, result: dict = None) -> str:
        """Format health check results as markdown.

        Args:
            result: Summary dict as returned by ``run_all``. When omitted,
                ``run_all`` is executed to produce one.

        Returns:
            A markdown document: heading, status line, a table with one row
            per check, and a "Critical Failures" list when there are any.
        """
        if result is None:
            result = self.run_all()
        lines = [
            f"# Ezra Health Check - {result['timestamp'][:19]}",
            "",
            f"**Status: {'HEALTHY' if result['healthy'] else 'UNHEALTHY'}** | "
            f"{result['passed']}/{result['total']} passed | "
            f"{result['elapsed_seconds']}s",
            "",
            "| Check | Status | Detail |",
            "|-------|--------|--------|",
        ]
        # NOTE(review): the PASS/FAIL icons are empty strings -- confirm
        # whether glyphs were intended here.
        icons = {"PASS": "", "FAIL": "", "ERROR": "⚠️"}
        for c in result["checks"]:
            icon = icons.get(c["status"], "?")
            crit = " 🔴" if c["critical"] and c["status"] != "PASS" else ""
            # Keep the table intact: a raw pipe or newline inside a detail
            # (e.g. an exception message) would split the row.
            cell = str(c["detail"]).replace("|", "\\|").replace("\r", " ").replace("\n", " ")
            lines.append(f"| {c['name']} | {icon} {c['status']}{crit} | {cell} |")
        if result["critical_failures"] > 0:
            lines.extend(["", "## Critical Failures"])
            for c in result["checks"]:
                if c["critical"] and c["status"] != "PASS":
                    lines.append(f"- **{c['name']}**: {c['detail']}")
        return "\n".join(lines)
if __name__ == "__main__":
    # Script entry point: run every check once and print the markdown report
    # rendered from that same result.
    checker = HealthCheck()
    summary = checker.run_all()
    rendered = checker.format_report(summary)
    print(rendered)