# hermes-agent/devkit/health.py

#!/usr/bin/env python3
"""
Fleet health monitor for wizard agents.
Checks local system state and reports structured health metrics.

Usage as CLI:
    python -m devkit.health
    python -m devkit.health --threshold-load 1.0 --threshold-disk 90 --fail-on-critical

Usage as module:
    from devkit.health import check_health
    report = check_health()
"""

import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from typing import Any, Dict, List, Optional


def _run(cmd: List[str]) -> str:
    """Execute *cmd* and return its stripped, decoded standard output.

    Standard error is discarded. Any failure (missing executable, non-zero
    exit status, undecodable output, ...) is reported in-band as a string
    starting with ``"error: "`` instead of being raised, so callers must
    check for that prefix.
    """
    try:
        captured = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
        text = captured.decode()
    except Exception as exc:
        return f"error: {exc}"
    return text.strip()


def check_health(threshold_load: float = 1.0, threshold_disk_percent: float = 90.0) -> Dict[str, Any]:
    """Collect a structured health report for the local machine.

    Gathers the load average (from /proc/loadavg), usage of the "/"
    filesystem, the contents of /proc/meminfo, processes whose command line
    matches "hermes", and the installed versions of a few key Python packages.

    Args:
        threshold_load: Mean of the 1/5/15-minute load averages above which
            the load status becomes "critical".
        threshold_disk_percent: Used-space percentage of "/" above which the
            disk status becomes "critical".

    Returns:
        A JSON-serializable dict with the keys "timestamp" (UTC), "overall",
        "load", "disk", "memory", "processes" and "packages". "overall" is
        "critical" when load or disk is critical, "warning" when no hermes
        process was found, and "ok" otherwise. A load reading that could not
        be obtained or parsed is described in load["status"] but does not
        change "overall".
    """
    gather_time = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    # Load average
    load_raw = _run(["cat", "/proc/loadavg"])
    load_values: List[float] = []
    avg_load = None
    if load_raw.startswith("error:"):
        # _run reports failures in-band; surface the message as the status.
        load_status = load_raw
    else:
        try:
            load_values = [float(x) for x in load_raw.split()[:3]]
            avg_load = sum(load_values) / len(load_values)
        except (ValueError, ZeroDivisionError) as e:
            # ValueError: non-numeric field; ZeroDivisionError: empty output.
            load_status = f"error parsing load: {e}"
        else:
            load_status = "critical" if avg_load > threshold_load else "ok"

    # Disk usage
    disk = shutil.disk_usage("/")
    disk_percent = (disk.used / disk.total) * 100 if disk.total else 0.0
    disk_status = "critical" if disk_percent > threshold_disk_percent else "ok"

    # Memory: keep every "Key: value" line as raw strings. If _run failed,
    # its "error: ..." string is split the same way and ends up under the
    # "error" key.
    meminfo = _run(["cat", "/proc/meminfo"])
    mem_stats = {}
    for line in meminfo.splitlines():
        if ":" in line:
            key, val = line.split(":", 1)
            mem_stats[key.strip()] = val.strip()

    # Running processes: each entry is [pid, command line] as printed by
    # `pgrep -a`.
    hermes_pids: List[List[str]] = []
    own_pid = str(os.getpid())
    try:
        ps_out = subprocess.check_output(
            ["pgrep", "-a", "-f", "hermes"], stderr=subprocess.DEVNULL
        ).decode(errors="replace")
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: pgrep exited non-zero (e.g. nothing matched).
        # OSError: pgrep is not installed or not executable. Either way the
        # report should still be produced, with no processes listed.
        ps_out = ""
    for line in ps_out.splitlines():
        fields = line.split(None, 1)
        # Skip blank lines, and skip this monitor itself: its own command
        # line can contain "hermes" (e.g. when run from a hermes-agent path),
        # which would otherwise hide the "no agent running" warning.
        if fields and fields[0] != own_pid:
            hermes_pids.append(fields)

    # Python package versions (key ones)
    key_packages = ["jupyterlab", "papermill", "requests"]
    pkg_versions: Dict[str, Any] = {}
    for pkg in key_packages:
        # Default to None so every package has a key even when pip succeeds
        # but prints no "Version:" line.
        pkg_versions[pkg] = None
        try:
            out = subprocess.check_output(
                [sys.executable, "-m", "pip", "show", pkg], stderr=subprocess.DEVNULL
            ).decode(errors="replace")
        except Exception:
            # Best-effort probe: a missing package or unusable pip is
            # reported as None rather than failing the whole health check.
            continue
        for line in out.splitlines():
            if line.startswith("Version:"):
                pkg_versions[pkg] = line.split(":", 1)[1].strip()
                break

    overall = "ok"
    if load_status == "critical" or disk_status == "critical":
        overall = "critical"
    elif not hermes_pids:
        overall = "warning"

    return {
        "timestamp": gather_time,
        "overall": overall,
        "load": {
            "raw": load_raw if not load_raw.startswith("error:") else None,
            "1min": load_values[0] if len(load_values) > 0 else None,
            "5min": load_values[1] if len(load_values) > 1 else None,
            "15min": load_values[2] if len(load_values) > 2 else None,
            "avg": round(avg_load, 3) if avg_load is not None else None,
            "threshold": threshold_load,
            "status": load_status,
        },
        "disk": {
            "total_gb": round(disk.total / (1024 ** 3), 2),
            "used_gb": round(disk.used / (1024 ** 3), 2),
            "free_gb": round(disk.free / (1024 ** 3), 2),
            "used_percent": round(disk_percent, 2),
            "threshold_percent": threshold_disk_percent,
            "status": disk_status,
        },
        "memory": mem_stats,
        "processes": {
            "hermes_count": len(hermes_pids),
            # Only the first ten entries are included to keep the report small.
            "hermes_pids": hermes_pids[:10],
        },
        "packages": pkg_versions,
    }


def main(argv: Optional[List[str]] = None) -> int:
    """Run the health check from the command line and print the JSON report.

    Args:
        argv: Arguments to parse, without the program name. When None,
            argparse reads sys.argv[1:]. An explicit empty list is honored
            and means "use all defaults".

    Returns:
        1 when --fail-on-critical was given and the report's overall status
        is "critical"; 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Fleet health monitor")
    parser.add_argument("--threshold-load", type=float, default=1.0)
    parser.add_argument("--threshold-disk", type=float, default=90.0)
    parser.add_argument("--fail-on-critical", action="store_true", help="Exit non-zero if overall is critical")
    # Hand argv to argparse unchanged: parse_args(None) already falls back to
    # sys.argv[1:], whereas `argv or sys.argv[1:]` would also replace an
    # explicitly passed empty list with the host process's arguments.
    args = parser.parse_args(argv)

    report = check_health(args.threshold_load, args.threshold_disk)
    print(json.dumps(report, indent=2))
    if args.fail_on_critical and report.get("overall") == "critical":
        return 1
    return 0


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())