#!/usr/bin/env python3 """ Webhook health dashboard for fleet agent endpoints. Issue: #855 in Timmy_Foundation/the-nexus Probes each configured /health endpoint, persists the last-known-good state to a JSON log, and generates a markdown dashboard in ~/.hermes/burn-logs/. Default targets: - bezalel: http://127.0.0.1:8650/health - allegro: http://127.0.0.1:8651/health - ezra: http://127.0.0.1:8652/health - adagio: http://127.0.0.1:8653/health Environment overrides: - WEBHOOK_HEALTH_TARGETS="allegro=http://127.0.0.1:8651/health,ezra=http://127.0.0.1:8652/health" - WEBHOOK_HEALTH_TIMEOUT=3 - WEBHOOK_STALE_AFTER=300 - WEBHOOK_HEALTH_OUTPUT=/custom/webhook-health-latest.md - WEBHOOK_HEALTH_HISTORY=/custom/webhook-health-history.json """ from __future__ import annotations import argparse import json import os import time import urllib.error import urllib.request from dataclasses import asdict, dataclass from pathlib import Path from typing import Any DEFAULT_TARGETS = { "bezalel": "http://127.0.0.1:8650/health", "allegro": "http://127.0.0.1:8651/health", "ezra": "http://127.0.0.1:8652/health", "adagio": "http://127.0.0.1:8653/health", } DEFAULT_TIMEOUT = float(os.environ.get("WEBHOOK_HEALTH_TIMEOUT", "3")) DEFAULT_STALE_AFTER = int(os.environ.get("WEBHOOK_STALE_AFTER", "300")) DEFAULT_OUTPUT = Path( os.environ.get( "WEBHOOK_HEALTH_OUTPUT", str(Path.home() / ".hermes" / "burn-logs" / "webhook-health-latest.md"), ) ).expanduser() DEFAULT_HISTORY = Path( os.environ.get( "WEBHOOK_HEALTH_HISTORY", str(Path.home() / ".hermes" / "burn-logs" / "webhook-health-history.json"), ) ).expanduser() @dataclass class AgentHealth: name: str url: str http_status: int | None healthy: bool latency_ms: int | None stale: bool last_success_ts: float | None checked_at: float message: str def status_icon(self) -> str: if self.healthy: return "🟢" if self.stale: return "🔴" return "🟠" def last_success_age_seconds(self) -> int | None: if self.last_success_ts is None: return None return max(0, int(self.checked_at - self.last_success_ts)) def parse_targets(raw: str | None) -> dict[str, str]: if not raw: return dict(DEFAULT_TARGETS) targets: dict[str, str] = {} for chunk in raw.split(","): chunk = chunk.strip() if not chunk: continue if "=" not in chunk: raise ValueError(f"Invalid target spec: {chunk!r}") name, url = chunk.split("=", 1) targets[name.strip()] = url.strip() if not targets: raise ValueError("No valid targets parsed") return targets def load_history(path: Path) -> dict[str, Any]: if not path.exists(): return {"agents": {}, "runs": []} return json.loads(path.read_text(encoding="utf-8")) def save_history(path: Path, history: dict[str, Any]) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(history, indent=2, sort_keys=True), encoding="utf-8") def probe_health(url: str, timeout: float) -> tuple[bool, int | None, int | None, str]: started = time.perf_counter() req = urllib.request.Request(url, headers={"User-Agent": "the-nexus/webhook-health-dashboard"}) try: with urllib.request.urlopen(req, timeout=timeout) as resp: body = resp.read(512) latency_ms = int((time.perf_counter() - started) * 1000) status = getattr(resp, "status", None) or 200 message = f"HTTP {status}" if body: try: payload = json.loads(body.decode("utf-8", errors="replace")) if isinstance(payload, dict) and payload.get("status"): message = f"HTTP {status} — {payload['status']}" except Exception: pass return 200 <= status < 300, status, latency_ms, message except urllib.error.HTTPError as e: latency_ms = int((time.perf_counter() - started) * 1000) return False, e.code, latency_ms, f"HTTP {e.code}" except urllib.error.URLError as e: latency_ms = int((time.perf_counter() - started) * 1000) return False, None, latency_ms, f"URL error: {e.reason}" except Exception as e: latency_ms = int((time.perf_counter() - started) * 1000) return False, None, latency_ms, f"Probe failed: {e}" def check_agents( targets: dict[str, str], history: dict[str, Any], timeout: float = DEFAULT_TIMEOUT, stale_after: int = DEFAULT_STALE_AFTER, ) -> list[AgentHealth]: checked_at = time.time() results: list[AgentHealth] = [] agent_state = history.setdefault("agents", {}) for name, url in targets.items(): state = agent_state.get(name, {}) last_success_ts = state.get("last_success_ts") ok, http_status, latency_ms, message = probe_health(url, timeout) if ok: last_success_ts = checked_at stale = False if not ok and last_success_ts is not None: stale = (checked_at - float(last_success_ts)) > stale_after result = AgentHealth( name=name, url=url, http_status=http_status, healthy=ok, latency_ms=latency_ms, stale=stale, last_success_ts=last_success_ts, checked_at=checked_at, message=message, ) agent_state[name] = { "url": url, "last_success_ts": last_success_ts, "last_http_status": http_status, "last_message": message, "last_checked_at": checked_at, } results.append(result) history.setdefault("runs", []).append( { "checked_at": checked_at, "healthy_count": sum(1 for r in results if r.healthy), "unhealthy_count": sum(1 for r in results if not r.healthy), "agents": [asdict(r) for r in results], } ) history["runs"] = history["runs"][-100:] return results def _format_age(seconds: int | None) -> str: if seconds is None: return "never" if seconds < 60: return f"{seconds}s ago" if seconds < 3600: return f"{seconds // 60}m ago" return f"{seconds // 3600}h ago" def to_markdown(results: list[AgentHealth], generated_at: float | None = None) -> str: generated_at = generated_at or time.time() ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(generated_at)) healthy = sum(1 for r in results if r.healthy) total = len(results) lines = [ f"# Agent Webhook Health Dashboard — {ts}", "", f"Healthy: {healthy}/{total}", "", "| Agent | Status | HTTP | Latency | Last success | Endpoint | Notes |", "|:------|:------:|:----:|--------:|:------------|:---------|:------|", ] for result in results: http = str(result.http_status) if result.http_status is not None else "—" latency = f"{result.latency_ms}ms" if result.latency_ms is not None else "—" lines.append( "| {name} | {icon} | {http} | {latency} | {last_success} | `{url}` | {message} |".format( name=result.name, icon=result.status_icon(), http=http, latency=latency, last_success=_format_age(result.last_success_age_seconds()), url=result.url, message=result.message, ) ) stale_agents = [r.name for r in results if r.stale] if stale_agents: lines.extend([ "", "## Stale agents", ", ".join(stale_agents), ]) lines.extend([ "", "Generated by `bin/webhook_health_dashboard.py`.", ]) return "\n".join(lines) def write_dashboard(path: Path, markdown: str) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(markdown + "\n", encoding="utf-8") def parse_args(argv: list[str]) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Generate webhook health dashboard") parser.add_argument("--targets", default=os.environ.get("WEBHOOK_HEALTH_TARGETS")) parser.add_argument("--timeout", type=float, default=DEFAULT_TIMEOUT) parser.add_argument("--stale-after", type=int, default=DEFAULT_STALE_AFTER) parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT) parser.add_argument("--history", type=Path, default=DEFAULT_HISTORY) return parser.parse_args(argv) def main(argv: list[str] | None = None) -> int: args = parse_args(argv or sys.argv[1:]) targets = parse_targets(args.targets) history = load_history(args.history) results = check_agents(targets, history, timeout=args.timeout, stale_after=args.stale_after) save_history(args.history, history) dashboard = to_markdown(results) write_dashboard(args.output, dashboard) print(args.output) print(f"healthy={sum(1 for r in results if r.healthy)} total={len(results)}") return 0 if __name__ == "__main__": import sys raise SystemExit(main(sys.argv[1:]))