ezra-environment/tools/health_check.py

#!/usr/bin/env python3
"""
Ezra self-check / health monitoring script.
Checks all wizard infrastructure and reports status.

Epic: EZRA-SELF-001 / Phase 4 - Self-Monitoring
Author: Ezra (self-improvement)
"""

import json
import os
import subprocess
import socket
import time
from datetime import datetime
from pathlib import Path


class HealthCheck:
    """Run health checks on Ezra's infrastructure.

    Every ``check_*`` static method returns an ``(ok, detail)`` tuple.
    :meth:`check` wraps one such call into a result dict and records it in
    ``self.results``; :meth:`run_all` runs the suite listed in ``CHECKS`` and
    returns a summary dict; :meth:`format_report` renders a summary as
    markdown.
    """

    # Locations and thresholds probed by the built-in checks. They are class
    # attributes so that they are defined in exactly one place.
    EZRA_HOME = Path("/root/wizards/ezra/home")
    BACKUP_DIR = Path("/root/wizards/ezra/backups")
    GITEA_API = "http://143.198.27.163:3000/api/v1"
    HTTP_TIMEOUT = 5  # seconds, per HTTP request
    MIN_FREE_GB = 2.0
    MAX_BACKUP_AGE_HOURS = 48

    # The suite executed by run_all(): (display name, method name, critical).
    CHECKS = (
        ("Disk Space", "check_disk_space", True),
        ("Hermes Gateway", "check_hermes_gateway", True),
        ("Gitea API", "check_gitea_api", True),
        ("Gitea Token", "check_gitea_token", True),
        ("llama-server", "check_llama_server", False),
        ("Memory File", "check_memory_file", False),
        ("Skills", "check_skills_count", False),
        ("Cron Jobs", "check_cron_jobs", False),
        ("Sessions DB", "check_sessions_db", False),
        ("Backups", "check_backups", False),
    )

    def __init__(self):
        self.results = []
        self.start_time = time.time()

    def check(self, name: str, fn, critical: bool = False) -> dict:
        """Run a single health check and record its outcome.

        Args:
            name: Display name of the check.
            fn: Zero-argument callable returning ``(ok, detail)``.
            critical: Whether a failure of this check makes the overall
                report unhealthy.

        Returns:
            A dict with keys ``name``, ``status`` (``"PASS"``, ``"FAIL"`` or
            ``"ERROR"``), ``detail`` and ``critical``. The same dict is
            appended to ``self.results``.
        """
        try:
            ok, detail = fn()
            status = "PASS" if ok else "FAIL"
        except Exception as e:
            # A check that blows up must not abort the whole run. Include the
            # exception type because str(e) alone is often empty or cryptic
            # (e.g. KeyError('x') renders as just "'x'").
            status = "ERROR"
            detail = f"{type(e).__name__}: {e}"
        result = {
            "name": name,
            "status": status,
            "detail": detail,
            "critical": critical,
        }
        self.results.append(result)
        return result

    # === Individual checks ===

    @staticmethod
    def check_disk_space() -> tuple[bool, str]:
        """Check free space on ``/``; passes when more than MIN_FREE_GB is free."""
        st = os.statvfs("/")
        gib = 1024 ** 3
        free_gb = (st.f_bavail * st.f_frsize) / gib
        total_gb = (st.f_blocks * st.f_frsize) / gib
        # Guard against a filesystem that reports zero blocks.
        pct_used = ((total_gb - free_gb) / total_gb) * 100 if total_gb else 0.0
        ok = free_gb > HealthCheck.MIN_FREE_GB
        return ok, f"{free_gb:.1f}GB free / {total_gb:.1f}GB total ({pct_used:.0f}% used)"

    @staticmethod
    def check_hermes_gateway() -> tuple[bool, str]:
        """Check that the process named in ``gateway.pid`` is alive."""
        pid_file = HealthCheck.EZRA_HOME / "gateway.pid"
        if not pid_file.exists():
            return False, "No gateway.pid found"
        try:
            pid = int(pid_file.read_text().strip())
        except ValueError:
            return False, "Gateway PID file exists but contains no valid PID"
        if pid <= 0:
            # kill() treats 0 and negative values as process groups, so they
            # can never identify the gateway process itself.
            return False, "Gateway PID file exists but contains no valid PID"
        try:
            os.kill(pid, 0)  # Signal 0: existence/permission probe only
        except ProcessLookupError:
            return False, "Gateway PID file exists but process not running"
        except PermissionError:
            # The process exists but belongs to another user: still running.
            pass
        return True, f"Gateway running (PID {pid})"

    @staticmethod
    def _fetch_json(url: str, headers: dict):
        """GET *url* and return the decoded JSON body, closing the response."""
        import urllib.request

        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=HealthCheck.HTTP_TIMEOUT) as resp:
            return json.loads(resp.read())

    @staticmethod
    def check_gitea_api() -> tuple[bool, str]:
        """Check Gitea API is reachable by requesting its version endpoint."""
        try:
            data = HealthCheck._fetch_json(
                f"{HealthCheck.GITEA_API}/version",
                {"Accept": "application/json"},
            )
            return True, f"Gitea {data.get('version', 'unknown')}"
        except Exception as e:
            # Any failure here (network, HTTP, bad JSON) means "not usable".
            return False, f"Gitea unreachable: {e}"

    @staticmethod
    def _read_env_token(env_file: Path) -> str:
        """Return the value of the first ``GITEA_TOKEN=`` line, or ``""``."""
        if not env_file.exists():
            return ""
        for line in env_file.read_text().splitlines():
            if line.startswith("GITEA_TOKEN="):
                return line.split("=", 1)[1].strip().strip('"').strip("'")
        return ""

    @staticmethod
    def check_gitea_token() -> tuple[bool, str]:
        """Check Gitea token validity.

        The token is taken from the ``GITEA_TOKEN`` environment variable and,
        when that is unset or empty, from the ``.env`` file under EZRA_HOME.
        """
        import urllib.error

        token = os.getenv("GITEA_TOKEN", "")
        if not token:
            token = HealthCheck._read_env_token(HealthCheck.EZRA_HOME / ".env")
        if not token:
            return False, "No GITEA_TOKEN found"
        try:
            data = HealthCheck._fetch_json(
                f"{HealthCheck.GITEA_API}/user",
                {"Authorization": f"token {token}", "Accept": "application/json"},
            )
            return True, f"Authenticated as {data.get('login', 'unknown')}"
        except urllib.error.HTTPError as e:
            # Only an authentication/authorization rejection says anything
            # about the token itself.
            if e.code in (401, 403):
                return False, f"Token invalid: {e}"
            return False, f"Token check failed: {e}"
        except Exception as e:
            # Network errors, timeouts and malformed responses do not prove
            # the token is bad, so do not report them as such.
            return False, f"Token check failed: {e}"

    @staticmethod
    def check_llama_server(port: int = 11435) -> tuple[bool, str]:
        """Check if something accepts TCP connections on 127.0.0.1:*port*."""
        try:
            # The context manager closes the socket on success; on failure
            # create_connection closes it before raising.
            with socket.create_connection(("127.0.0.1", port), timeout=3):
                pass
        except (OSError, OverflowError, ValueError):
            return False, f"llama-server not responding on :{port}"
        return True, f"llama-server listening on :{port}"

    @staticmethod
    def check_memory_file() -> tuple[bool, str]:
        """Check Ezra's memory file exists and has content (size > 0)."""
        mem = HealthCheck.EZRA_HOME / "memories" / "MEMORY.md"
        if not mem.exists():
            return False, "MEMORY.md not found"
        size = mem.stat().st_size
        # errors="replace": a stray undecodable byte must not turn a line
        # count into an exception.
        text = mem.read_text(encoding="utf-8", errors="replace")
        lines = len(text.splitlines())
        return size > 0, f"MEMORY.md: {lines} lines, {size} bytes"

    @staticmethod
    def check_skills_count() -> tuple[bool, str]:
        """Count installed skills (one per ``SKILL.md`` found recursively)."""
        skills_dir = HealthCheck.EZRA_HOME / "skills"
        if not skills_dir.exists():
            return False, "Skills directory not found"
        count = sum(1 for _ in skills_dir.rglob("SKILL.md"))
        return count > 0, f"{count} skills installed"

    @staticmethod
    def check_cron_jobs() -> tuple[bool, str]:
        """Check cron jobs status.

        Accepts either a top-level JSON list of job objects or an object
        wrapping that list under a ``"jobs"`` key. A job counts as active
        when its ``"status"`` equals ``"active"``.
        """
        cron_file = HealthCheck.EZRA_HOME / "cron" / "jobs.json"
        if not cron_file.exists():
            return False, "No cron jobs.json found"
        try:
            jobs = json.loads(cron_file.read_text())
        except Exception as e:
            return False, f"Error reading jobs.json: {e}"
        if isinstance(jobs, dict) and isinstance(jobs.get("jobs"), list):
            jobs = jobs["jobs"]
        if not isinstance(jobs, list):
            return False, (
                "Error reading jobs.json: expected a list of jobs, "
                f"got {type(jobs).__name__}"
            )
        # Ignore malformed (non-object) entries instead of crashing on them.
        active = sum(
            1 for j in jobs if isinstance(j, dict) and j.get("status") == "active"
        )
        return True, f"{active} active / {len(jobs)} total cron jobs"

    @staticmethod
    def check_sessions_db() -> tuple[bool, str]:
        """Check the sessions database file exists and report its size."""
        db_path = HealthCheck.EZRA_HOME / "state.db"
        if not db_path.exists():
            return False, "state.db not found"
        size_mb = db_path.stat().st_size / (1024 * 1024)
        return True, f"state.db: {size_mb:.1f}MB"

    @staticmethod
    def check_backups() -> tuple[bool, str]:
        """Check backup freshness.

        Passes when the newest ``*.tar.gz`` in BACKUP_DIR (or, if there is
        none, the newest entry of any kind) is younger than
        MAX_BACKUP_AGE_HOURS.
        """
        backup_dir = HealthCheck.BACKUP_DIR
        if not backup_dir.exists():
            return False, "No backups directory"
        # Stat each candidate once, pairing it with its mtime.
        candidates = [(p.stat().st_mtime, p) for p in backup_dir.glob("*.tar.gz")]
        if not candidates:
            candidates = [(p.stat().st_mtime, p) for p in backup_dir.glob("*")]
        if not candidates:
            return False, "No backups found"
        mtime, latest = max(candidates, key=lambda item: item[0])
        age_hours = (time.time() - mtime) / 3600
        fresh = age_hours < HealthCheck.MAX_BACKUP_AGE_HOURS
        return fresh, f"Latest: {latest.name} ({age_hours:.0f}h ago)"

    # === Runner ===

    def run_all(self) -> dict:
        """Run every check listed in ``CHECKS`` and return a summary dict.

        Each call starts from a clean slate: ``self.results`` is rebound to a
        new list and the timer restarts, so repeated runs neither double-count
        checks nor mutate the ``checks`` list of a previously returned report.

        Returns:
            A dict with keys ``timestamp``, ``elapsed_seconds``, ``total``,
            ``passed``, ``failed``, ``critical_failures``, ``healthy`` and
            ``checks``.
        """
        self.results = []
        self.start_time = time.time()

        for name, method_name, critical in self.CHECKS:
            self.check(name, getattr(self, method_name), critical=critical)

        elapsed = time.time() - self.start_time
        not_ok = [r for r in self.results if r["status"] in ("FAIL", "ERROR")]
        passed = sum(1 for r in self.results if r["status"] == "PASS")
        crit_fail = sum(1 for r in not_ok if r["critical"])

        return {
            "timestamp": datetime.now().isoformat(),
            "elapsed_seconds": round(elapsed, 2),
            "total": len(self.results),
            "passed": passed,
            "failed": len(not_ok),
            "critical_failures": crit_fail,
            "healthy": crit_fail == 0,
            "checks": self.results,
        }

    @staticmethod
    def _md_cell(value) -> str:
        """Make *value* safe for a one-line markdown table cell."""
        # Newlines end the row and a bare pipe starts a new column.
        return " ".join(str(value).splitlines()).replace("|", "\\|")

    def format_report(self, result: "dict | None" = None) -> str:
        """Format health check results as markdown.

        Args:
            result: A summary dict as returned by :meth:`run_all`. When
                omitted, :meth:`run_all` is executed to obtain one.

        Returns:
            The markdown report as a single string.
        """
        if result is None:
            result = self.run_all()

        lines = [
            f"# Ezra Health Check - {result['timestamp'][:19]}",
            "",
            f"**Status: {'HEALTHY' if result['healthy'] else 'UNHEALTHY'}** | "
            f"{result['passed']}/{result['total']} passed | "
            f"{result['elapsed_seconds']}s",
            "",
            "| Check | Status | Detail |",
            "|-------|--------|--------|",
        ]
        icons = {"PASS": "✅", "FAIL": "❌", "ERROR": "⚠️"}
        for c in result["checks"]:
            icon = icons.get(c["status"], "?")
            crit = " 🔴" if c["critical"] and c["status"] != "PASS" else ""
            lines.append(
                f"| {self._md_cell(c['name'])} | {icon} {c['status']}{crit} "
                f"| {self._md_cell(c['detail'])} |"
            )

        if result["critical_failures"] > 0:
            lines.extend(["", "## Critical Failures"])
            for c in result["checks"]:
                if c["critical"] and c["status"] != "PASS":
                    lines.append(f"- **{c['name']}**: {c['detail']}")

        return "\n".join(lines)


if __name__ == "__main__":
    # Script entry point: run the full suite once and print the markdown report.
    checker = HealthCheck()
    print(checker.format_report(checker.run_all()))