Files
the-nexus/bin/webhook_health_dashboard.py
Timmy Time 0b57145dde
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
[timmy] Add webhook health dashboard (#855) (#885)
2026-04-06 15:51:22 +00:00

276 lines
9.1 KiB
Python

#!/usr/bin/env python3
"""
Webhook health dashboard for fleet agent endpoints.
Issue: #855 in Timmy_Foundation/the-nexus
Probes each configured /health endpoint, persists the last-known-good state to a
JSON log, and generates a markdown dashboard in ~/.hermes/burn-logs/.
Default targets:
- bezalel: http://127.0.0.1:8650/health
- allegro: http://127.0.0.1:8651/health
- ezra: http://127.0.0.1:8652/health
- adagio: http://127.0.0.1:8653/health
Environment overrides:
- WEBHOOK_HEALTH_TARGETS="allegro=http://127.0.0.1:8651/health,ezra=http://127.0.0.1:8652/health"
- WEBHOOK_HEALTH_TIMEOUT=3
- WEBHOOK_STALE_AFTER=300
- WEBHOOK_HEALTH_OUTPUT=/custom/webhook-health-latest.md
- WEBHOOK_HEALTH_HISTORY=/custom/webhook-health-history.json
"""
from __future__ import annotations
import argparse
import json
import os
import time
import urllib.error
import urllib.request
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
# Built-in probe targets (agent name -> health URL) used when no override is given.
DEFAULT_TARGETS = dict(
    bezalel="http://127.0.0.1:8650/health",
    allegro="http://127.0.0.1:8651/health",
    ezra="http://127.0.0.1:8652/health",
    adagio="http://127.0.0.1:8653/health",
)

# Per-request timeout handed to urlopen; overridable via WEBHOOK_HEALTH_TIMEOUT.
DEFAULT_TIMEOUT = float(os.getenv("WEBHOOK_HEALTH_TIMEOUT", "3"))

# Age (compared against time.time() differences) after which a failing agent
# is flagged stale; overridable via WEBHOOK_STALE_AFTER.
DEFAULT_STALE_AFTER = int(os.getenv("WEBHOOK_STALE_AFTER", "300"))

# Destination of the rendered markdown dashboard; overridable via WEBHOOK_HEALTH_OUTPUT.
DEFAULT_OUTPUT = Path(
    os.getenv(
        "WEBHOOK_HEALTH_OUTPUT",
        str(Path.home().joinpath(".hermes", "burn-logs", "webhook-health-latest.md")),
    )
).expanduser()

# Location of the JSON history file; overridable via WEBHOOK_HEALTH_HISTORY.
DEFAULT_HISTORY = Path(
    os.getenv(
        "WEBHOOK_HEALTH_HISTORY",
        str(Path.home().joinpath(".hermes", "burn-logs", "webhook-health-history.json")),
    )
).expanduser()
@dataclass
class AgentHealth:
    """Outcome of a single health probe against one agent endpoint."""

    # Agent label used as the key in the target mapping.
    name: str
    # Health endpoint URL that was probed.
    url: str
    # HTTP status code of the response, or None when no response was obtained.
    http_status: int | None
    # True when the probe counted as a success.
    healthy: bool
    # Measured probe duration in milliseconds, when available.
    latency_ms: int | None
    # True when the agent is failing and its last success is older than the threshold.
    stale: bool
    # Timestamp of the most recent successful probe, or None if there never was one.
    last_success_ts: float | None
    # Timestamp at which this probe round was taken.
    checked_at: float
    # Short human-readable summary of the probe outcome.
    message: str

    def status_icon(self) -> str:
        """Return the dashboard emoji: green if healthy, red if stale, orange otherwise."""
        if not self.healthy:
            return "🔴" if self.stale else "🟠"
        return "🟢"

    def last_success_age_seconds(self) -> int | None:
        """Return whole seconds between the last success and this check (never negative).

        Returns None when no successful probe has been recorded.
        """
        if self.last_success_ts is not None:
            elapsed = int(self.checked_at - self.last_success_ts)
            return elapsed if elapsed > 0 else 0
        return None
def parse_targets(raw: str | None) -> dict[str, str]:
    """Turn a ``name=url,name=url`` specification into a name -> URL mapping.

    A missing or empty specification yields a copy of ``DEFAULT_TARGETS``.
    Blank comma-separated entries are ignored; names and URLs are stripped of
    surrounding whitespace, and a repeated name keeps its last URL.

    Raises:
        ValueError: if an entry has no ``=`` or if no entry remains after
            skipping blanks.
    """
    if not raw:
        return dict(DEFAULT_TARGETS)

    parsed: dict[str, str] = {}
    for spec in (piece.strip() for piece in raw.split(",")):
        if not spec:
            continue
        # Split on the first "=" only, so URLs may themselves contain "=".
        label, separator, endpoint = spec.partition("=")
        if not separator:
            raise ValueError(f"Invalid target spec: {spec!r}")
        parsed[label.strip()] = endpoint.strip()

    if not parsed:
        raise ValueError("No valid targets parsed")
    return parsed
def load_history(path: Path) -> dict[str, Any]:
    """Read the JSON history stored at *path*.

    When the file does not exist, a fresh structure with an empty ``agents``
    mapping and an empty ``runs`` list is returned instead.
    """
    if path.exists():
        with path.open(encoding="utf-8") as handle:
            return json.loads(handle.read())
    return {"agents": {}, "runs": []}
def save_history(path: Path, history: dict[str, Any]) -> None:
    """Write *history* to *path* as indented, key-sorted JSON.

    Missing parent directories are created first.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    # Serialize fully before opening the file so a failed dump never truncates it.
    serialized = json.dumps(history, sort_keys=True, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
def probe_health(url: str, timeout: float) -> tuple[bool, int | None, int | None, str]:
    """Probe one health endpoint and summarize the outcome.

    Args:
        url: Endpoint to request.
        timeout: Timeout passed to ``urllib.request.urlopen``.

    Returns:
        A ``(ok, http_status, latency_ms, message)`` tuple. ``ok`` is True only
        for a 2xx response. ``http_status`` is None when no HTTP response was
        obtained. ``latency_ms`` is the elapsed time in whole milliseconds
        until the response was read or the failure was raised. ``message`` is
        ``"HTTP <code>"``, extended with the payload's ``status`` field when
        the body is a JSON object that carries one, or an error description.

    This function does not raise: every failure is reported through the tuple.
    """
    started = time.perf_counter()
    req = urllib.request.Request(url, headers={"User-Agent": "the-nexus/webhook-health-dashboard"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            # Only the first 512 bytes are inspected.
            body = resp.read(512)
            latency_ms = int((time.perf_counter() - started) * 1000)
            # Responses without a usable status attribute are treated as 200.
            status = getattr(resp, "status", None) or 200
            message = f"HTTP {status}"
            if body:
                try:
                    payload = json.loads(body.decode("utf-8", errors="replace"))
                except ValueError:
                    # Non-JSON or truncated body: keep the plain HTTP message.
                    payload = None
                if isinstance(payload, dict) and payload.get("status"):
                    # Separate the code from the payload status so the message
                    # reads "HTTP 200 — ok" rather than "HTTP 200ok".
                    message = f"HTTP {status} — {payload['status']}"
            return 200 <= status < 300, status, latency_ms, message
    except urllib.error.HTTPError as e:
        # Must precede URLError, of which HTTPError is a subclass.
        latency_ms = int((time.perf_counter() - started) * 1000)
        return False, e.code, latency_ms, f"HTTP {e.code}"
    except urllib.error.URLError as e:
        latency_ms = int((time.perf_counter() - started) * 1000)
        return False, None, latency_ms, f"URL error: {e.reason}"
    except Exception as e:
        # Deliberate catch-all: a single broken endpoint must not abort the run.
        latency_ms = int((time.perf_counter() - started) * 1000)
        return False, None, latency_ms, f"Probe failed: {e}"
def check_agents(
    targets: dict[str, str],
    history: dict[str, Any],
    timeout: float = DEFAULT_TIMEOUT,
    stale_after: int = DEFAULT_STALE_AFTER,
) -> list[AgentHealth]:
    """Probe every target once and fold the outcome into *history*.

    Args:
        targets: Mapping of agent name to health URL.
        history: Mutable history structure; updated in place.
        timeout: Timeout forwarded to ``probe_health``.
        stale_after: Maximum age of the last success before a failing agent
            is reported as stale.

    Returns:
        One ``AgentHealth`` per target, in the iteration order of *targets*.

    Side effects on *history*: ``history["agents"][name]`` is replaced with the
    latest state of each probed agent, a summary of this round is appended to
    ``history["runs"]``, and only the 100 most recent runs are kept.
    """
    now = time.time()
    per_agent = history.setdefault("agents", {})
    outcomes: list[AgentHealth] = []

    for agent, endpoint in targets.items():
        previous_success = per_agent.get(agent, {}).get("last_success_ts")
        ok, code, latency, note = probe_health(endpoint, timeout)

        if ok:
            # A success resets the clock and can never be stale.
            success_ts = now
            is_stale = False
        else:
            success_ts = previous_success
            # Without any recorded success there is nothing to measure against.
            is_stale = (
                success_ts is not None
                and (now - float(success_ts)) > stale_after
            )

        outcomes.append(
            AgentHealth(
                name=agent,
                url=endpoint,
                http_status=code,
                healthy=ok,
                latency_ms=latency,
                stale=is_stale,
                last_success_ts=success_ts,
                checked_at=now,
                message=note,
            )
        )
        per_agent[agent] = {
            "url": endpoint,
            "last_success_ts": success_ts,
            "last_http_status": code,
            "last_message": note,
            "last_checked_at": now,
        }

    healthy_total = len([entry for entry in outcomes if entry.healthy])
    run_record = {
        "checked_at": now,
        "healthy_count": healthy_total,
        "unhealthy_count": len(outcomes) - healthy_total,
        "agents": [asdict(entry) for entry in outcomes],
    }
    history.setdefault("runs", []).append(run_record)
    # Cap the stored run log at the 100 newest entries.
    history["runs"] = history["runs"][-100:]
    return outcomes
def _format_age(seconds: int | None) -> str:
if seconds is None:
return "never"
if seconds < 60:
return f"{seconds}s ago"
if seconds < 3600:
return f"{seconds // 60}m ago"
return f"{seconds // 3600}h ago"
def to_markdown(results: list[AgentHealth], generated_at: float | None = None) -> str:
    """Render probe results as a markdown dashboard.

    Args:
        results: Probe outcomes to tabulate, one table row each.
        generated_at: Epoch timestamp shown in the title; the current time is
            used only when this is None (an explicit ``0`` is honored).

    Returns:
        The markdown document without a trailing newline: a title, a healthy
        count, one table row per result, an optional "Stale agents" section,
        and a footer.
    """
    if generated_at is None:
        generated_at = time.time()
    ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(generated_at))
    healthy = sum(1 for r in results if r.healthy)
    total = len(results)
    lines = [
        f"# Agent Webhook Health Dashboard — {ts}",
        "",
        f"Healthy: {healthy}/{total}",
        "",
        "| Agent | Status | HTTP | Latency | Last success | Endpoint | Notes |",
        "|:------|:------:|:----:|--------:|:------------|:---------|:------|",
    ]
    for result in results:
        http = str(result.http_status) if result.http_status is not None else ""
        latency = f"{result.latency_ms}ms" if result.latency_ms is not None else ""
        # The message may carry text supplied by the probed endpoint; keep it
        # from breaking the table by flattening newlines and escaping pipes.
        safe_message = (
            result.message.replace("\r", " ").replace("\n", " ").replace("|", "\\|")
        )
        lines.append(
            "| {name} | {icon} | {http} | {latency} | {last_success} | `{url}` | {message} |".format(
                name=result.name,
                icon=result.status_icon(),
                http=http,
                latency=latency,
                last_success=_format_age(result.last_success_age_seconds()),
                url=result.url,
                message=safe_message,
            )
        )
    stale_agents = [r.name for r in results if r.stale]
    if stale_agents:
        lines.extend([
            "",
            "## Stale agents",
            ", ".join(stale_agents),
        ])
    lines.extend([
        "",
        "Generated by `bin/webhook_health_dashboard.py`.",
    ])
    return "\n".join(lines)
def write_dashboard(path: Path, markdown: str) -> None:
    """Write *markdown* plus a trailing newline to *path* as UTF-8.

    Missing parent directories are created first.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    content = markdown + "\n"
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the dashboard's command-line options from *argv*.

    Each option falls back to the matching module default; ``--targets``
    falls back to the ``WEBHOOK_HEALTH_TARGETS`` environment variable.
    """
    cli = argparse.ArgumentParser(description="Generate webhook health dashboard")
    # (flag, add_argument keyword settings), registered in declaration order.
    option_table = (
        ("--targets", {"default": os.environ.get("WEBHOOK_HEALTH_TARGETS")}),
        ("--timeout", {"type": float, "default": DEFAULT_TIMEOUT}),
        ("--stale-after", {"type": int, "default": DEFAULT_STALE_AFTER}),
        ("--output", {"type": Path, "default": DEFAULT_OUTPUT}),
        ("--history", {"type": Path, "default": DEFAULT_HISTORY}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args(argv)
def main(argv: list[str] | None = None) -> int:
    """Run one dashboard cycle: probe, persist history, and write the markdown.

    Args:
        argv: Command-line arguments without the program name. When None, the
            process arguments (``sys.argv[1:]``) are used; an explicit empty
            list means "no arguments".

    Returns:
        Always 0. The output path and a ``healthy=<n> total=<n>`` summary are
        printed to stdout.
    """
    # Imported here because the module's top-level imports do not include
    # sys; without this, calling main() from an importing module raised
    # NameError.
    import sys

    args = parse_args(sys.argv[1:] if argv is None else argv)
    targets = parse_targets(args.targets)
    history = load_history(args.history)
    results = check_agents(targets, history, timeout=args.timeout, stale_after=args.stale_after)
    # Persist history before rendering so the recorded state survives a
    # failure while writing the dashboard.
    save_history(args.history, history)
    dashboard = to_markdown(results)
    write_dashboard(args.output, dashboard)
    print(args.output)
    print(f"healthy={sum(1 for r in results if r.healthy)} total={len(results)}")
    return 0
# Script entry point: run one dashboard cycle and exit with main()'s return value.
if __name__ == "__main__":
    import sys

    exit_code = main(sys.argv[1:])
    raise SystemExit(exit_code)