#!/usr/bin/env python3
"""
Fleet health monitor for wizard agents.

Checks local system state and reports structured health metrics.

Usage as CLI:
    python -m devkit.health
    python -m devkit.health --threshold-load 1.0 --threshold-disk 90.0 --fail-on-critical

Usage as module:
    from devkit.health import check_health
    report = check_health()
"""

import argparse
import json
import os
import shutil
import subprocess
import sys
import time
from importlib import metadata as importlib_metadata
from typing import Any, Dict, List, Optional

# Distributions whose installed versions are included in every report.
_KEY_PACKAGES = ("jupyterlab", "papermill", "requests")

# Prefix used by the helpers below to signal a failure in-band.
_ERROR_PREFIX = "error:"


def _run(cmd: List[str]) -> str:
    """Run *cmd* and return its stripped stdout, or ``"error: <reason>"``.

    Failures (missing binary, non-zero exit, undecodable output) never raise;
    they are reported in-band so a single broken probe cannot abort a report.
    Not called inside this module; kept as an importable module-level helper.
    """
    try:
        return subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode().strip()
    except Exception as e:  # deliberate best-effort: see docstring
        return f"error: {e}"


def _read_text(path: str) -> str:
    """Return the stripped contents of *path*, or ``"error: <reason>"``."""
    try:
        with open(path, encoding="utf-8", errors="replace") as fh:
            return fh.read().strip()
    except OSError as e:
        return f"error: {e}"


def _load_report(threshold_load: float) -> Dict[str, Any]:
    """Build the ``load`` section from the first three fields of /proc/loadavg."""
    load_raw = _read_text("/proc/loadavg")
    load_values: List[float] = []
    avg_load: Optional[float] = None

    if load_raw.startswith(_ERROR_PREFIX):
        load_status = load_raw
    else:
        try:
            load_values = [float(field) for field in load_raw.split()[:3]]
            # An empty file yields no fields -> ZeroDivisionError, reported below.
            avg_load = sum(load_values) / len(load_values)
        except (ValueError, ZeroDivisionError) as e:
            load_status = f"error parsing load: {e}"
        else:
            load_status = "critical" if avg_load > threshold_load else "ok"

    # Pad so that missing fields are reported as None instead of raising.
    one_min, five_min, fifteen_min = (load_values + [None, None, None])[:3]
    return {
        "raw": None if load_raw.startswith(_ERROR_PREFIX) else load_raw,
        "1min": one_min,
        "5min": five_min,
        "15min": fifteen_min,
        "avg": round(avg_load, 3) if avg_load is not None else None,
        "threshold": threshold_load,
        "status": load_status,
    }


def _disk_report(threshold_disk_percent: float) -> Dict[str, Any]:
    """Build the ``disk`` section from ``shutil.disk_usage("/")``."""
    disk = shutil.disk_usage("/")
    # Guard against a zero-sized filesystem to avoid dividing by zero.
    disk_percent = (disk.used / disk.total) * 100 if disk.total else 0.0
    gib = 1024 ** 3
    return {
        "total_gb": round(disk.total / gib, 2),
        "used_gb": round(disk.used / gib, 2),
        "free_gb": round(disk.free / gib, 2),
        "used_percent": round(disk_percent, 2),
        "threshold_percent": threshold_disk_percent,
        "status": "critical" if disk_percent > threshold_disk_percent else "ok",
    }


def _memory_report() -> Dict[str, str]:
    """Parse /proc/meminfo into ``{field: value}``.

    Values are kept as the raw text after the colon. If the file cannot be
    read the result is ``{"error": <reason>}``.
    """
    meminfo = _read_text("/proc/meminfo")
    if meminfo.startswith(_ERROR_PREFIX):
        return {"error": meminfo[len(_ERROR_PREFIX):].strip()}

    mem_stats: Dict[str, str] = {}
    for line in meminfo.splitlines():
        key, sep, val = line.partition(":")
        if sep:
            mem_stats[key.strip()] = val.strip()
    return mem_stats


def _hermes_processes() -> List[List[str]]:
    """Return ``[pid, command line]`` pairs reported by ``pgrep -a -f hermes``.

    Returns an empty list when pgrep exits non-zero (e.g. nothing matched) or
    cannot be executed at all (e.g. not installed).
    """
    try:
        ps_out = subprocess.check_output(
            ["pgrep", "-a", "-f", "hermes"], stderr=subprocess.DEVNULL
        )
    except (subprocess.CalledProcessError, OSError):
        return []
    # Command lines are arbitrary bytes; never let a bad byte abort the report.
    text = ps_out.decode(errors="replace").strip()
    return [line.split(None, 1) for line in text.splitlines() if line.strip()]


def _package_versions(packages) -> Dict[str, Optional[str]]:
    """Map each name in *packages* to its installed version, or None."""
    versions: Dict[str, Optional[str]] = {}
    for pkg in packages:
        try:
            versions[pkg] = importlib_metadata.version(pkg)
        except Exception:
            # Best-effort: a missing or broken installation is reported as
            # None rather than failing the whole health check.
            versions[pkg] = None
    return versions


def check_health(threshold_load: float = 1.0, threshold_disk_percent: float = 90.0) -> Dict[str, Any]:
    """Collect a point-in-time health report for the local machine.

    Args:
        threshold_load: The load section is "critical" when the mean of the
            three load-average values exceeds this number.
        threshold_disk_percent: The disk section is "critical" when the used
            percentage of "/" exceeds this number.

    Returns:
        A JSON-serialisable dict with the keys ``timestamp`` (UTC),
        ``overall``, ``load``, ``disk``, ``memory``, ``processes`` and
        ``packages``. ``overall`` is "critical" if load or disk is critical,
        otherwise "warning" if no hermes process was found, otherwise "ok".
        A load probe that failed carries its error text in
        ``load["status"]`` and does not affect ``overall``.
    """
    gather_time = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    load = _load_report(threshold_load)
    disk = _disk_report(threshold_disk_percent)
    memory = _memory_report()
    hermes_pids = _hermes_processes()
    pkg_versions = _package_versions(_KEY_PACKAGES)

    if "critical" in (load["status"], disk["status"]):
        overall = "critical"
    elif not hermes_pids:
        overall = "warning"
    else:
        overall = "ok"

    return {
        "timestamp": gather_time,
        "overall": overall,
        "load": load,
        "disk": disk,
        "memory": memory,
        "processes": {
            "hermes_count": len(hermes_pids),
            # Cap the listing so the report stays small on busy hosts.
            "hermes_pids": hermes_pids[:10],
        },
        "packages": pkg_versions,
    }


def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point: print the report as JSON and return the exit code.

    Args:
        argv: Argument list without the program name. ``None`` means "use
            ``sys.argv[1:]``"; an explicit empty list means "no arguments".

    Returns:
        1 if ``--fail-on-critical`` was given and the overall status is
        "critical", otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Fleet health monitor")
    parser.add_argument("--threshold-load", type=float, default=1.0)
    parser.add_argument("--threshold-disk", type=float, default=90.0)
    parser.add_argument("--fail-on-critical", action="store_true",
                        help="Exit non-zero if overall is critical")
    # parse_args itself falls back to sys.argv[1:] only when argv is None, so
    # an explicitly empty list is honoured instead of being overridden.
    args = parser.parse_args(argv)

    report = check_health(args.threshold_load, args.threshold_disk)
    print(json.dumps(report, indent=2))

    if args.fail_on_critical and report.get("overall") == "critical":
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())