From 50fc25da0d239819903ab271d9a2b728c4981d3f Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Mon, 6 Apr 2026 11:45:55 -0400 Subject: [PATCH] feat: add webhook health dashboard (#855) --- bin/webhook_health_dashboard.py | 275 +++++++++++++++++++++++++ tests/test_webhook_health_dashboard.py | 120 +++++++++++ 2 files changed, 395 insertions(+) create mode 100644 bin/webhook_health_dashboard.py create mode 100644 tests/test_webhook_health_dashboard.py diff --git a/bin/webhook_health_dashboard.py b/bin/webhook_health_dashboard.py new file mode 100644 index 0000000..2dd6db2 --- /dev/null +++ b/bin/webhook_health_dashboard.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +""" +Webhook health dashboard for fleet agent endpoints. + +Issue: #855 in Timmy_Foundation/the-nexus + +Probes each configured /health endpoint, persists the last-known-good state to a +JSON log, and generates a markdown dashboard in ~/.hermes/burn-logs/. + +Default targets: +- bezalel: http://127.0.0.1:8650/health +- allegro: http://127.0.0.1:8651/health +- ezra: http://127.0.0.1:8652/health +- adagio: http://127.0.0.1:8653/health + +Environment overrides: +- WEBHOOK_HEALTH_TARGETS="allegro=http://127.0.0.1:8651/health,ezra=http://127.0.0.1:8652/health" +- WEBHOOK_HEALTH_TIMEOUT=3 +- WEBHOOK_STALE_AFTER=300 +- WEBHOOK_HEALTH_OUTPUT=/custom/webhook-health-latest.md +- WEBHOOK_HEALTH_HISTORY=/custom/webhook-health-history.json +""" + +from __future__ import annotations + +import argparse +import json +import os +import time +import urllib.error +import urllib.request +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Any + +DEFAULT_TARGETS = { + "bezalel": "http://127.0.0.1:8650/health", + "allegro": "http://127.0.0.1:8651/health", + "ezra": "http://127.0.0.1:8652/health", + "adagio": "http://127.0.0.1:8653/health", +} + +DEFAULT_TIMEOUT = float(os.environ.get("WEBHOOK_HEALTH_TIMEOUT", "3")) +DEFAULT_STALE_AFTER = int(os.environ.get("WEBHOOK_STALE_AFTER", 
"300")) +DEFAULT_OUTPUT = Path( + os.environ.get( + "WEBHOOK_HEALTH_OUTPUT", + str(Path.home() / ".hermes" / "burn-logs" / "webhook-health-latest.md"), + ) +).expanduser() +DEFAULT_HISTORY = Path( + os.environ.get( + "WEBHOOK_HEALTH_HISTORY", + str(Path.home() / ".hermes" / "burn-logs" / "webhook-health-history.json"), + ) +).expanduser() + + +@dataclass +class AgentHealth: + name: str + url: str + http_status: int | None + healthy: bool + latency_ms: int | None + stale: bool + last_success_ts: float | None + checked_at: float + message: str + + def status_icon(self) -> str: + if self.healthy: + return "🟢" + if self.stale: + return "🔴" + return "🟠" + + def last_success_age_seconds(self) -> int | None: + if self.last_success_ts is None: + return None + return max(0, int(self.checked_at - self.last_success_ts)) + + +def parse_targets(raw: str | None) -> dict[str, str]: + if not raw: + return dict(DEFAULT_TARGETS) + targets: dict[str, str] = {} + for chunk in raw.split(","): + chunk = chunk.strip() + if not chunk: + continue + if "=" not in chunk: + raise ValueError(f"Invalid target spec: {chunk!r}") + name, url = chunk.split("=", 1) + targets[name.strip()] = url.strip() + if not targets: + raise ValueError("No valid targets parsed") + return targets + + +def load_history(path: Path) -> dict[str, Any]: + if not path.exists(): + return {"agents": {}, "runs": []} + return json.loads(path.read_text(encoding="utf-8")) + + +def save_history(path: Path, history: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(history, indent=2, sort_keys=True), encoding="utf-8") + + +def probe_health(url: str, timeout: float) -> tuple[bool, int | None, int | None, str]: + started = time.perf_counter() + req = urllib.request.Request(url, headers={"User-Agent": "the-nexus/webhook-health-dashboard"}) + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + body = resp.read(512) + latency_ms = int((time.perf_counter() - 
started) * 1000) + status = getattr(resp, "status", None) or 200 + message = f"HTTP {status}" + if body: + try: + payload = json.loads(body.decode("utf-8", errors="replace")) + if isinstance(payload, dict) and payload.get("status"): + message = f"HTTP {status} — {payload['status']}" + except Exception: + pass + return 200 <= status < 300, status, latency_ms, message + except urllib.error.HTTPError as e: + latency_ms = int((time.perf_counter() - started) * 1000) + return False, e.code, latency_ms, f"HTTP {e.code}" + except urllib.error.URLError as e: + latency_ms = int((time.perf_counter() - started) * 1000) + return False, None, latency_ms, f"URL error: {e.reason}" + except Exception as e: + latency_ms = int((time.perf_counter() - started) * 1000) + return False, None, latency_ms, f"Probe failed: {e}" + + +def check_agents( + targets: dict[str, str], + history: dict[str, Any], + timeout: float = DEFAULT_TIMEOUT, + stale_after: int = DEFAULT_STALE_AFTER, +) -> list[AgentHealth]: + checked_at = time.time() + results: list[AgentHealth] = [] + agent_state = history.setdefault("agents", {}) + + for name, url in targets.items(): + state = agent_state.get(name, {}) + last_success_ts = state.get("last_success_ts") + ok, http_status, latency_ms, message = probe_health(url, timeout) + if ok: + last_success_ts = checked_at + stale = False + if not ok and last_success_ts is not None: + stale = (checked_at - float(last_success_ts)) > stale_after + result = AgentHealth( + name=name, + url=url, + http_status=http_status, + healthy=ok, + latency_ms=latency_ms, + stale=stale, + last_success_ts=last_success_ts, + checked_at=checked_at, + message=message, + ) + agent_state[name] = { + "url": url, + "last_success_ts": last_success_ts, + "last_http_status": http_status, + "last_message": message, + "last_checked_at": checked_at, + } + results.append(result) + + history.setdefault("runs", []).append( + { + "checked_at": checked_at, + "healthy_count": sum(1 for r in results if 
r.healthy), + "unhealthy_count": sum(1 for r in results if not r.healthy), + "agents": [asdict(r) for r in results], + } + ) + history["runs"] = history["runs"][-100:] + return results + + +def _format_age(seconds: int | None) -> str: + if seconds is None: + return "never" + if seconds < 60: + return f"{seconds}s ago" + if seconds < 3600: + return f"{seconds // 60}m ago" + return f"{seconds // 3600}h ago" + + +def to_markdown(results: list[AgentHealth], generated_at: float | None = None) -> str: + generated_at = generated_at or time.time() + ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(generated_at)) + healthy = sum(1 for r in results if r.healthy) + total = len(results) + + lines = [ + f"# Agent Webhook Health Dashboard — {ts}", + "", + f"Healthy: {healthy}/{total}", + "", + "| Agent | Status | HTTP | Latency | Last success | Endpoint | Notes |", + "|:------|:------:|:----:|--------:|:------------|:---------|:------|", + ] + for result in results: + http = str(result.http_status) if result.http_status is not None else "—" + latency = f"{result.latency_ms}ms" if result.latency_ms is not None else "—" + lines.append( + "| {name} | {icon} | {http} | {latency} | {last_success} | `{url}` | {message} |".format( + name=result.name, + icon=result.status_icon(), + http=http, + latency=latency, + last_success=_format_age(result.last_success_age_seconds()), + url=result.url, + message=result.message, + ) + ) + + stale_agents = [r.name for r in results if r.stale] + if stale_agents: + lines.extend([ + "", + "## Stale agents", + ", ".join(stale_agents), + ]) + + lines.extend([ + "", + "Generated by `bin/webhook_health_dashboard.py`.", + ]) + return "\n".join(lines) + + +def write_dashboard(path: Path, markdown: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(markdown + "\n", encoding="utf-8") + + +def parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate webhook health dashboard") 
+ parser.add_argument("--targets", default=os.environ.get("WEBHOOK_HEALTH_TARGETS")) + parser.add_argument("--timeout", type=float, default=DEFAULT_TIMEOUT) + parser.add_argument("--stale-after", type=int, default=DEFAULT_STALE_AFTER) + parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT) + parser.add_argument("--history", type=Path, default=DEFAULT_HISTORY) + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv or sys.argv[1:]) + targets = parse_targets(args.targets) + history = load_history(args.history) + results = check_agents(targets, history, timeout=args.timeout, stale_after=args.stale_after) + save_history(args.history, history) + dashboard = to_markdown(results) + write_dashboard(args.output, dashboard) + print(args.output) + print(f"healthy={sum(1 for r in results if r.healthy)} total={len(results)}") + return 0 + + +if __name__ == "__main__": + import sys + raise SystemExit(main(sys.argv[1:])) diff --git a/tests/test_webhook_health_dashboard.py b/tests/test_webhook_health_dashboard.py new file mode 100644 index 0000000..b05c1be --- /dev/null +++ b/tests/test_webhook_health_dashboard.py @@ -0,0 +1,120 @@ +"""Tests for webhook health dashboard generation.""" + +from __future__ import annotations + +import importlib.util +import json +import sys +import time +from pathlib import Path +from unittest.mock import patch + +PROJECT_ROOT = Path(__file__).parent.parent + +_spec = importlib.util.spec_from_file_location( + "webhook_health_dashboard_test", + PROJECT_ROOT / "bin" / "webhook_health_dashboard.py", +) +_mod = importlib.util.module_from_spec(_spec) +sys.modules["webhook_health_dashboard_test"] = _mod +_spec.loader.exec_module(_mod) + +AgentHealth = _mod.AgentHealth +check_agents = _mod.check_agents +load_history = _mod.load_history +parse_targets = _mod.parse_targets +save_history = _mod.save_history +sort_key = None +to_markdown = _mod.to_markdown +write_dashboard = 
"""Tests for webhook health dashboard generation."""

import importlib.util
import json
import sys
import time
from pathlib import Path
from unittest.mock import patch

PROJECT_ROOT = Path(__file__).parent.parent

# The script lives in bin/ (not a package), so load it by file path and
# register it under a fixed module name that the @patch targets below use.
_spec = importlib.util.spec_from_file_location(
    "webhook_health_dashboard_test",
    PROJECT_ROOT / "bin" / "webhook_health_dashboard.py",
)
_mod = importlib.util.module_from_spec(_spec)
sys.modules["webhook_health_dashboard_test"] = _mod
_spec.loader.exec_module(_mod)

AgentHealth = _mod.AgentHealth
check_agents = _mod.check_agents
load_history = _mod.load_history
parse_targets = _mod.parse_targets
save_history = _mod.save_history
to_markdown = _mod.to_markdown
write_dashboard = _mod.write_dashboard
main = _mod.main

ALLEGRO_URL = "http://localhost:8651/health"


class TestParseTargets:
    def test_defaults_when_none(self):
        targets = parse_targets(None)
        assert targets["allegro"].endswith(":8651/health")
        assert targets["ezra"].endswith(":8652/health")

    def test_parse_csv_mapping(self):
        targets = parse_targets("alpha=http://a/health,beta=http://b/health")
        assert targets == {
            "alpha": "http://a/health",
            "beta": "http://b/health",
        }


class TestCheckAgents:
    @patch("webhook_health_dashboard_test.probe_health")
    def test_updates_last_success_for_healthy(self, mock_probe):
        mock_probe.return_value = (True, 200, 42, "HTTP 200")
        history = {"agents": {}, "runs": []}
        results = check_agents({"allegro": ALLEGRO_URL}, history, timeout=1, stale_after=300)
        assert len(results) == 1
        assert results[0].healthy is True
        assert history["agents"]["allegro"]["last_success_ts"] is not None

    @patch("webhook_health_dashboard_test.probe_health")
    def test_marks_stale_after_threshold(self, mock_probe):
        mock_probe.return_value = (False, None, 12, "URL error: refused")
        history = {
            "agents": {
                "allegro": {
                    "last_success_ts": time.time() - 301,
                }
            },
            "runs": [],
        }
        results = check_agents({"allegro": ALLEGRO_URL}, history, timeout=1, stale_after=300)
        assert results[0].healthy is False
        assert results[0].stale is True

    @patch("webhook_health_dashboard_test.probe_health")
    def test_not_stale_without_prior_success(self, mock_probe):
        # An agent that has never succeeded has no age to compare against.
        mock_probe.return_value = (False, None, 12, "URL error: refused")
        history = {"agents": {}, "runs": []}
        results = check_agents({"allegro": ALLEGRO_URL}, history, timeout=1, stale_after=300)
        assert results[0].stale is False
        assert results[0].last_success_ts is None

    @patch("webhook_health_dashboard_test.probe_health")
    def test_run_history_is_capped(self, mock_probe):
        mock_probe.return_value = (True, 200, 5, "HTTP 200")
        history = {"agents": {}, "runs": [{"checked_at": i} for i in range(150)]}
        check_agents({"allegro": ALLEGRO_URL}, history, timeout=1, stale_after=300)
        assert len(history["runs"]) == 100
        assert history["runs"][-1]["healthy_count"] == 1


class TestMarkdown:
    def test_contains_table_and_icons(self):
        now = time.time()
        results = [
            AgentHealth("allegro", "http://localhost:8651/health", 200, True, 31, False, now - 5, now, "HTTP 200 — ok"),
            AgentHealth("ezra", "http://localhost:8652/health", None, False, 14, True, now - 600, now, "URL error: refused"),
        ]
        md = to_markdown(results, generated_at=now)
        assert "| Agent | Status | HTTP |" in md
        assert "🟢" in md
        assert "🔴" in md
        assert "Stale agents" in md
        assert "ezra" in md


class TestFileIO:
    def test_save_and_load_history(self, tmp_path):
        path = tmp_path / "history.json"
        payload = {"agents": {"a": {"last_success_ts": 1}}, "runs": []}
        save_history(path, payload)
        loaded = load_history(path)
        assert loaded == payload

    def test_write_dashboard(self, tmp_path):
        out = tmp_path / "dashboard.md"
        write_dashboard(out, "# Test")
        assert out.read_text(encoding="utf-8") == "# Test\n"


class TestMain:
    @patch("webhook_health_dashboard_test.probe_health")
    def test_main_writes_outputs(self, mock_probe, tmp_path):
        mock_probe.return_value = (True, 200, 10, "HTTP 200")
        output = tmp_path / "dashboard.md"
        history = tmp_path / "history.json"
        rc = main([
            "--targets", "allegro=http://localhost:8651/health",
            "--output", str(output),
            "--history", str(history),
            "--timeout", "1",
            "--stale-after", "300",
        ])
        assert rc == 0
        assert output.exists()
        assert history.exists()
        # The dashboard contains emoji and an em dash; decode it as the UTF-8
        # it was written in rather than with the locale default.
        assert "allegro" in output.read_text(encoding="utf-8")
        runs = json.loads(history.read_text(encoding="utf-8"))["runs"]
        assert len(runs) == 1