Some checks failed
Notebook CI / notebook-smoke (pull_request) Failing after 2s
- gitea_client.py — reusable Gitea API client for issues, PRs, comments - health.py — fleet health monitor (load, disk, memory, processes) - notebook_runner.py — Papermill wrapper with JSON reporting - smoke_test.py — fast smoke tests and bare green-path e2e - secret_scan.py — secret leak scanner for CI gating - wizard_env.py — environment validator for bootstrapping agents - README.md — usage guide for all tools These tools are designed to be used by any wizard via python -m devkit.<tool>. Rising up as a platform, not a silo.
135 lines
4.4 KiB
Python
135 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fleet health monitor for wizard agents.
|
|
Checks local system state and reports structured health metrics.
|
|
|
|
Usage as CLI:
|
|
python -m devkit.health
|
|
python -m devkit.health --threshold-load 1.0 --threshold-disk 90 --fail-on-critical
|
|
|
|
Usage as module:
|
|
from devkit.health import check_health
|
|
report = check_health()
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from typing import Any, Dict, List
|
|
|
|
|
|
def _run(cmd: List[str]) -> str:
    """Execute *cmd* and return its standard output as stripped text.

    The command's stderr is discarded.  This helper is best-effort: any
    ``Exception`` raised while launching the command, waiting for it, or
    decoding its output is converted into a string of the form
    ``"error: <message>"`` instead of propagating, so callers detect
    failure by checking for that prefix.

    Args:
        cmd: Program and arguments, passed to ``subprocess`` as a list.

    Returns:
        The decoded, whitespace-stripped stdout, or an ``"error: ..."``
        string describing what went wrong.
    """
    try:
        raw_output = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
        text = raw_output.decode()
    except Exception as exc:
        return f"error: {exc}"
    return text.strip()
|
|
|
|
|
|
def _read_proc_file(path: str) -> str:
    """Return the stripped text of *path*, or ``"error: <reason>"`` if unreadable."""
    try:
        with open(path, encoding="utf-8", errors="replace") as handle:
            return handle.read().strip()
    except OSError as exc:
        return f"error: {exc}"


def _load_section(threshold_load: float) -> Dict[str, Any]:
    """Build the ``load`` section of the report from ``/proc/loadavg``."""
    load_raw = _read_proc_file("/proc/loadavg")
    load_values: List[float] = []
    avg_load = None
    if load_raw.startswith("error:"):
        load_status = load_raw
    else:
        try:
            load_values = [float(field) for field in load_raw.split()[:3]]
            # An empty file yields no values; the division then raises and is
            # reported as a parse error instead of crashing the report.
            avg_load = sum(load_values) / len(load_values)
        except (ValueError, ZeroDivisionError) as exc:
            load_status = f"error parsing load: {exc}"
        else:
            load_status = "critical" if avg_load > threshold_load else "ok"

    # Pad so that missing fields are reported as None.
    one_min, five_min, fifteen_min = (load_values + [None, None, None])[:3]
    return {
        "raw": None if load_raw.startswith("error:") else load_raw,
        "1min": one_min,
        "5min": five_min,
        "15min": fifteen_min,
        "avg": round(avg_load, 3) if avg_load is not None else None,
        "threshold": threshold_load,
        "status": load_status,
    }


def _disk_section(threshold_disk_percent: float) -> Dict[str, Any]:
    """Build the ``disk`` section of the report for the root filesystem."""
    disk = shutil.disk_usage("/")
    disk_percent = (disk.used / disk.total) * 100 if disk.total else 0.0
    gib = 1024 ** 3
    return {
        "total_gb": round(disk.total / gib, 2),
        "used_gb": round(disk.used / gib, 2),
        "free_gb": round(disk.free / gib, 2),
        "used_percent": round(disk_percent, 2),
        "threshold_percent": threshold_disk_percent,
        "status": "critical" if disk_percent > threshold_disk_percent else "ok",
    }


def _memory_section() -> Dict[str, str]:
    """Parse ``/proc/meminfo`` into a ``{field: value}`` mapping of strings."""
    meminfo = _read_proc_file("/proc/meminfo")
    if meminfo.startswith("error:"):
        # Report the failure under an explicit "error" key.
        return {"error": meminfo[len("error:"):].strip()}
    stats: Dict[str, str] = {}
    for line in meminfo.splitlines():
        key, sep, val = line.partition(":")
        if sep:
            stats[key.strip()] = val.strip()
    return stats


def _hermes_processes() -> List[List[str]]:
    """Return ``[pid, command line]`` entries for processes matching "hermes"."""
    try:
        ps_out = subprocess.check_output(
            ["pgrep", "-a", "-f", "hermes"], stderr=subprocess.DEVNULL
        )
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: pgrep exited non-zero (e.g. nothing matched).
        # OSError: pgrep could not be launched at all (e.g. not installed).
        # Either way the report should say "no processes found", not crash.
        return []
    return [
        line.split(None, 1)
        for line in ps_out.decode(errors="replace").splitlines()
        if line.strip()
    ]


def _package_versions(packages: List[str]) -> Dict[str, Any]:
    """Map each package name to its installed version, or ``None``."""
    # Function-scope import (Python 3.8+): reads installed metadata directly
    # instead of spawning one ``pip show`` subprocess per package.
    from importlib import metadata

    versions: Dict[str, Any] = {}
    for pkg in packages:
        try:
            versions[pkg] = metadata.version(pkg)
        except Exception:
            # Best-effort: a missing package or broken metadata is reported
            # as None rather than aborting the whole health report.
            versions[pkg] = None
    return versions


def check_health(threshold_load: float = 1.0, threshold_disk_percent: float = 90.0) -> Dict[str, Any]:
    """Collect a structured health report for the local machine.

    The report covers load average (``/proc/loadavg``), root-filesystem disk
    usage, memory (``/proc/meminfo``), running processes whose command line
    matches ``hermes`` (via ``pgrep``), and the installed versions of a few
    key Python packages.

    Args:
        threshold_load: The load status is ``"critical"`` when the mean of
            the 1/5/15-minute load averages is greater than this value.
        threshold_disk_percent: The disk status is ``"critical"`` when the
            used percentage of ``/`` is greater than this value.

    Returns:
        A dict with the keys ``timestamp`` (UTC, ``%Y-%m-%dT%H:%M:%SZ``),
        ``overall``, ``load``, ``disk``, ``memory``, ``processes`` and
        ``packages``.  ``overall`` is ``"critical"`` if the load or disk
        status is critical, otherwise ``"warning"`` if no hermes process was
        found, otherwise ``"ok"``.  When load data cannot be read or parsed,
        ``load["status"]`` carries an ``"error..."`` message, which does not
        by itself make ``overall`` critical.
    """
    gather_time = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    load = _load_section(threshold_load)
    disk = _disk_section(threshold_disk_percent)
    mem_stats = _memory_section()
    hermes_pids = _hermes_processes()
    pkg_versions = _package_versions(["jupyterlab", "papermill", "requests"])

    if "critical" in (load["status"], disk["status"]):
        overall = "critical"
    elif not hermes_pids:
        overall = "warning"
    else:
        overall = "ok"

    return {
        "timestamp": gather_time,
        "overall": overall,
        "load": load,
        "disk": disk,
        "memory": mem_stats,
        "processes": {
            "hermes_count": len(hermes_pids),
            # Only the first ten entries are included in the report.
            "hermes_pids": hermes_pids[:10],
        },
        "packages": pkg_versions,
    }
|
|
|
|
|
|
def main(argv: List[str] = None) -> int:
    """Run the health check from the command line and print a JSON report.

    Args:
        argv: Arguments to parse, without the program name.  ``None`` (the
            default) means "use ``sys.argv[1:]``".  An explicit empty list
            is honoured and parsed as "no arguments".

    Returns:
        ``1`` when ``--fail-on-critical`` was given and the report's
        ``overall`` status is ``"critical"``; ``0`` otherwise.

    Raises:
        SystemExit: Raised by ``argparse`` for ``--help`` or invalid
            arguments.
    """
    # Test identity, not truthiness: ``argv or sys.argv[1:]`` would silently
    # replace a caller-supplied empty list with the process's own arguments.
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description="Fleet health monitor")
    parser.add_argument("--threshold-load", type=float, default=1.0)
    parser.add_argument("--threshold-disk", type=float, default=90.0)
    parser.add_argument("--fail-on-critical", action="store_true", help="Exit non-zero if overall is critical")
    args = parser.parse_args(argv)

    report = check_health(args.threshold_load, args.threshold_disk)
    print(json.dumps(report, indent=2))
    if args.fail_on_critical and report.get("overall") == "critical":
        return 1
    return 0
|
|
|
|
|
|
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|