Compare commits
1 Commits
mimo/code/
...
timmy/issu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
50fc25da0d |
275
bin/webhook_health_dashboard.py
Normal file
275
bin/webhook_health_dashboard.py
Normal file
@@ -0,0 +1,275 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Webhook health dashboard for fleet agent endpoints.
|
||||||
|
|
||||||
|
Issue: #855 in Timmy_Foundation/the-nexus
|
||||||
|
|
||||||
|
Probes each configured /health endpoint, persists the last-known-good state to a
|
||||||
|
JSON log, and generates a markdown dashboard in ~/.hermes/burn-logs/.
|
||||||
|
|
||||||
|
Default targets:
|
||||||
|
- bezalel: http://127.0.0.1:8650/health
|
||||||
|
- allegro: http://127.0.0.1:8651/health
|
||||||
|
- ezra: http://127.0.0.1:8652/health
|
||||||
|
- adagio: http://127.0.0.1:8653/health
|
||||||
|
|
||||||
|
Environment overrides:
|
||||||
|
- WEBHOOK_HEALTH_TARGETS="allegro=http://127.0.0.1:8651/health,ezra=http://127.0.0.1:8652/health"
|
||||||
|
- WEBHOOK_HEALTH_TIMEOUT=3
|
||||||
|
- WEBHOOK_STALE_AFTER=300
|
||||||
|
- WEBHOOK_HEALTH_OUTPUT=/custom/webhook-health-latest.md
|
||||||
|
- WEBHOOK_HEALTH_HISTORY=/custom/webhook-health-history.json
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
# Built-in fleet endpoints, used by parse_targets() when no explicit list is given.
DEFAULT_TARGETS = dict(
    bezalel="http://127.0.0.1:8650/health",
    allegro="http://127.0.0.1:8651/health",
    ezra="http://127.0.0.1:8652/health",
    adagio="http://127.0.0.1:8653/health",
)

# Per-probe timeout (seconds) and staleness threshold (seconds); both can be
# overridden through the environment and are read once at import time.
DEFAULT_TIMEOUT = float(os.getenv("WEBHOOK_HEALTH_TIMEOUT", "3"))
DEFAULT_STALE_AFTER = int(os.getenv("WEBHOOK_STALE_AFTER", "300"))

# Where the markdown dashboard is written unless WEBHOOK_HEALTH_OUTPUT says otherwise.
DEFAULT_OUTPUT = Path(
    os.getenv(
        "WEBHOOK_HEALTH_OUTPUT",
        str(Path.home().joinpath(".hermes", "burn-logs", "webhook-health-latest.md")),
    )
).expanduser()

# Where the JSON history is kept unless WEBHOOK_HEALTH_HISTORY says otherwise.
DEFAULT_HISTORY = Path(
    os.getenv(
        "WEBHOOK_HEALTH_HISTORY",
        str(Path.home().joinpath(".hermes", "burn-logs", "webhook-health-history.json")),
    )
).expanduser()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AgentHealth:
    """Outcome of one health probe for a single agent endpoint.

    Field order is part of the interface: instances are also built
    positionally.
    """

    name: str
    url: str
    http_status: int | None  # None when no HTTP status was obtained
    healthy: bool
    latency_ms: int | None
    stale: bool  # unhealthy and the last success is older than the threshold
    last_success_ts: float | None  # None if the agent has never succeeded
    checked_at: float
    message: str

    def status_icon(self) -> str:
        """Return the dashboard emoji: green if healthy, red if stale, else orange."""
        if not self.healthy:
            return "🔴" if self.stale else "🟠"
        return "🟢"

    def last_success_age_seconds(self) -> int | None:
        """Return whole seconds between the last success and this check.

        Returns None when there is no recorded success. A negative
        difference is clamped to 0.
        """
        last_ok = self.last_success_ts
        if last_ok is None:
            return None
        elapsed = int(self.checked_at - last_ok)
        return elapsed if elapsed > 0 else 0
|
||||||
|
|
||||||
|
|
||||||
|
def parse_targets(raw: str | None) -> dict[str, str]:
    """Parse a ``name=url,name=url`` string into a mapping of agent name to URL.

    Args:
        raw: Comma-separated ``name=url`` pairs. ``None`` or an empty string
            selects a copy of ``DEFAULT_TARGETS``.

    Returns:
        Mapping of agent name to health URL, in the order given. If a name
        is repeated, the last URL wins.

    Raises:
        ValueError: If a chunk has no ``=``, has an empty name or an empty
            URL, or if no targets remain after parsing.
    """
    if not raw:
        return dict(DEFAULT_TARGETS)

    targets: dict[str, str] = {}
    for chunk in raw.split(","):
        chunk = chunk.strip()
        if not chunk:
            # Tolerate stray or trailing commas.
            continue
        # Split on the first "=" only: URLs may contain "=" in a query string.
        name, sep, url = chunk.partition("=")
        name, url = name.strip(), url.strip()
        # An empty name or URL would yield a target that cannot be probed or
        # identified on the dashboard, so reject it up front.
        if not sep or not name or not url:
            raise ValueError(f"Invalid target spec: {chunk!r}")
        targets[name] = url

    if not targets:
        raise ValueError("No valid targets parsed")
    return targets
|
||||||
|
|
||||||
|
|
||||||
|
def load_history(path: Path) -> dict[str, Any]:
    """Load the persisted probe history from *path*.

    Args:
        path: Location of the JSON history file.

    Returns:
        The decoded history object. A missing or empty file yields a fresh
        ``{"agents": {}, "runs": []}`` structure.

    Raises:
        ValueError: If the file holds invalid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or JSON that is not an object.
    """
    # Read first and handle absence afterwards: an exists() check followed by
    # a read can race with another process removing the file.
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return {"agents": {}, "runs": []}

    if not text.strip():
        # A zero-length file carries no history; start clean.
        return {"agents": {}, "runs": []}

    history = json.loads(text)
    if not isinstance(history, dict):
        raise ValueError(
            f"History file {path} must contain a JSON object, "
            f"got {type(history).__name__}"
        )
    return history
|
||||||
|
|
||||||
|
|
||||||
|
def save_history(path: Path, history: dict[str, Any]) -> None:
    """Persist *history* to *path* as indented, key-sorted JSON.

    The data is written to a temporary sibling file and then moved over the
    destination with ``os.replace``, so an interrupted run does not leave a
    truncated history file behind.

    Args:
        path: Destination file; parent directories are created as needed.
        history: JSON-serializable history structure.

    Raises:
        TypeError: If *history* contains values ``json.dumps`` cannot encode.
        OSError: If the file cannot be written or renamed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(history, indent=2, sort_keys=True)

    # Same directory as the target so the rename stays on one filesystem;
    # the pid keeps overlapping runs from sharing a temp file.
    tmp_path = path.with_name(f".{path.name}.{os.getpid()}.tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        os.replace(tmp_path, path)
    except OSError:
        # Do not leave a stray temp file behind if the write or rename failed.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
|
||||||
|
|
||||||
|
|
||||||
|
def probe_health(url: str, timeout: float) -> tuple[bool, int | None, int | None, str]:
    """Issue one GET against *url* and summarize the outcome.

    This function does not raise for probe failures: every error, including
    a malformed URL, is reported through the returned tuple so that one bad
    target cannot abort the run.

    Args:
        url: Health endpoint to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``(ok, http_status, latency_ms, message)`` where ``ok`` is True for a
        2xx status, ``http_status`` is None when no HTTP status was obtained,
        and ``message`` is a short description. If the body is a JSON object
        with a truthy ``status`` key, that value is appended to the message.
    """
    started = time.perf_counter()

    def elapsed_ms() -> int:
        return int((time.perf_counter() - started) * 1000)

    try:
        # Request() raises ValueError for a URL without a scheme, so it must
        # sit inside the try block alongside the network call.
        req = urllib.request.Request(
            url, headers={"User-Agent": "the-nexus/webhook-health-dashboard"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            # Only the first 512 bytes are needed for the status hint.
            body = resp.read(512)
            latency_ms = elapsed_ms()
            # Responses without a status are treated as HTTP 200.
            status = getattr(resp, "status", None) or 200
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first.
        return False, e.code, elapsed_ms(), f"HTTP {e.code}"
    except urllib.error.URLError as e:
        return False, None, elapsed_ms(), f"URL error: {e.reason}"
    except Exception as e:
        return False, None, elapsed_ms(), f"Probe failed: {e}"

    message = f"HTTP {status}"
    if body:
        # Best effort: a non-JSON or truncated body leaves the plain message.
        try:
            payload = json.loads(body.decode("utf-8", errors="replace"))
        except (ValueError, RecursionError):
            payload = None
        if isinstance(payload, dict) and payload.get("status"):
            message = f"HTTP {status} — {payload['status']}"
    return 200 <= status < 300, status, latency_ms, message
|
||||||
|
|
||||||
|
|
||||||
|
def check_agents(
    targets: dict[str, str],
    history: dict[str, Any],
    timeout: float = DEFAULT_TIMEOUT,
    stale_after: int = DEFAULT_STALE_AFTER,
) -> list[AgentHealth]:
    """Probe every target and fold the outcome into *history* in place.

    Args:
        targets: Mapping of agent name to health URL.
        history: Mutable history structure; its ``"agents"`` and ``"runs"``
            entries are created if missing and updated by this call.
        timeout: Timeout in seconds passed to each probe.
        stale_after: Seconds since the last success after which a failing
            agent is flagged stale.

    Returns:
        One ``AgentHealth`` per target, in the iteration order of *targets*.
    """
    now = time.time()
    per_agent = history.setdefault("agents", {})
    outcomes: list[AgentHealth] = []

    for agent, endpoint in targets.items():
        previous_ok_ts = per_agent.get(agent, {}).get("last_success_ts")
        is_up, code, elapsed_ms, note = probe_health(endpoint, timeout)

        if is_up:
            last_ok_ts = now
            is_stale = False
        elif previous_ok_ts is None:
            # Never seen healthy: there is no age to compare, so not stale.
            last_ok_ts = None
            is_stale = False
        else:
            last_ok_ts = previous_ok_ts
            is_stale = (now - float(previous_ok_ts)) > stale_after

        outcomes.append(
            AgentHealth(
                name=agent,
                url=endpoint,
                http_status=code,
                healthy=is_up,
                latency_ms=elapsed_ms,
                stale=is_stale,
                last_success_ts=last_ok_ts,
                checked_at=now,
                message=note,
            )
        )
        per_agent[agent] = {
            "url": endpoint,
            "last_success_ts": last_ok_ts,
            "last_http_status": code,
            "last_message": note,
            "last_checked_at": now,
        }

    healthy_total = sum(1 for item in outcomes if item.healthy)
    run_log = history.setdefault("runs", [])
    run_log.append(
        {
            "checked_at": now,
            "healthy_count": healthy_total,
            "unhealthy_count": len(outcomes) - healthy_total,
            "agents": [asdict(item) for item in outcomes],
        }
    )
    # Keep only the 100 most recent runs so the history file stays bounded.
    history["runs"] = run_log[-100:]
    return outcomes
|
||||||
|
|
||||||
|
|
||||||
|
def _format_age(seconds: int | None) -> str:
|
||||||
|
if seconds is None:
|
||||||
|
return "never"
|
||||||
|
if seconds < 60:
|
||||||
|
return f"{seconds}s ago"
|
||||||
|
if seconds < 3600:
|
||||||
|
return f"{seconds // 60}m ago"
|
||||||
|
return f"{seconds // 3600}h ago"
|
||||||
|
|
||||||
|
|
||||||
|
def _md_cell(value: object) -> str:
    """Return *value* as text that is safe inside one markdown table cell."""
    # A raw newline ends the row and an unescaped pipe starts a new column.
    flattened = " ".join(str(value).splitlines())
    return flattened.replace("|", "\\|")


def to_markdown(results: list[AgentHealth], generated_at: float | None = None) -> str:
    """Render probe results as a markdown dashboard.

    Args:
        results: Probe outcomes, rendered as table rows in the given order.
        generated_at: Epoch seconds for the heading timestamp; the current
            time is used when None.

    Returns:
        The dashboard text without a trailing newline. A "Stale agents"
        section is included only when at least one result is stale.
    """
    # Compare against None explicitly so a timestamp of 0.0 is honored.
    if generated_at is None:
        generated_at = time.time()
    ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(generated_at))
    healthy = sum(1 for r in results if r.healthy)
    total = len(results)

    lines = [
        f"# Agent Webhook Health Dashboard — {ts}",
        "",
        f"Healthy: {healthy}/{total}",
        "",
        "| Agent | Status | HTTP | Latency | Last success | Endpoint | Notes |",
        "|:------|:------:|:----:|--------:|:------------|:---------|:------|",
    ]
    for result in results:
        http = str(result.http_status) if result.http_status is not None else "—"
        latency = f"{result.latency_ms}ms" if result.latency_ms is not None else "—"
        # Name, URL and message can carry operator- or remote-supplied text
        # (endpoint payloads, exception messages), so they are escaped.
        lines.append(
            "| {name} | {icon} | {http} | {latency} | {last_success} | `{url}` | {message} |".format(
                name=_md_cell(result.name),
                icon=result.status_icon(),
                http=http,
                latency=latency,
                last_success=_format_age(result.last_success_age_seconds()),
                url=_md_cell(result.url),
                message=_md_cell(result.message),
            )
        )

    stale_agents = [r.name for r in results if r.stale]
    if stale_agents:
        lines.extend(
            [
                "",
                "## Stale agents",
                ", ".join(stale_agents),
            ]
        )

    lines.extend(
        [
            "",
            "Generated by `bin/webhook_health_dashboard.py`.",
        ]
    )
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def write_dashboard(path: Path, markdown: str) -> None:
    """Write *markdown* plus a trailing newline to *path* as UTF-8.

    Missing parent directories are created first.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(f"{markdown}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the dashboard command-line options from *argv*.

    Defaults come from the module-level ``DEFAULT_*`` values, except for
    ``--targets``, which defaults to the ``WEBHOOK_HEALTH_TARGETS``
    environment variable read at call time.
    """
    cli = argparse.ArgumentParser(description="Generate webhook health dashboard")
    option_specs = (
        ("--targets", {"default": os.environ.get("WEBHOOK_HEALTH_TARGETS")}),
        ("--timeout", {"type": float, "default": DEFAULT_TIMEOUT}),
        ("--stale-after", {"type": int, "default": DEFAULT_STALE_AFTER}),
        ("--output", {"type": Path, "default": DEFAULT_OUTPUT}),
        ("--history", {"type": Path, "default": DEFAULT_HISTORY}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args(argv)
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: list[str] | None = None) -> int:
    """Run one probe cycle: check agents, persist history, write the dashboard.

    Args:
        argv: Command-line arguments without the program name. ``None`` means
            "use ``sys.argv[1:]``"; an explicit empty list means "no
            arguments", so all defaults apply.

    Returns:
        Process exit code; 0 once the dashboard has been written.
    """
    # Imported here because the module only imports sys inside its
    # __main__ guard, which does not run when this file is imported.
    import sys

    # Test for None rather than truthiness so main([]) is not silently
    # replaced by the host process's own arguments.
    args = parse_args(sys.argv[1:] if argv is None else argv)
    targets = parse_targets(args.targets)
    history = load_history(args.history)
    results = check_agents(targets, history, timeout=args.timeout, stale_after=args.stale_after)
    save_history(args.history, history)
    dashboard = to_markdown(results)
    write_dashboard(args.output, dashboard)
    print(args.output)
    print(f"healthy={sum(1 for r in results if r.healthy)} total={len(results)}")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run one probe cycle and exit with main()'s return code.
if __name__ == "__main__":
    import sys

    sys.exit(main(sys.argv[1:]))
|
||||||
120
tests/test_webhook_health_dashboard.py
Normal file
120
tests/test_webhook_health_dashboard.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
"""Tests for webhook health dashboard generation."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
PROJECT_ROOT = Path(__file__).parent.parent

# The script lives in bin/, not in an importable package, so load it by file
# path under a dedicated module name.
_spec = importlib.util.spec_from_file_location(
    "webhook_health_dashboard_test",
    PROJECT_ROOT / "bin" / "webhook_health_dashboard.py",
)
_mod = importlib.util.module_from_spec(_spec)
# Register before executing so patch("webhook_health_dashboard_test....")
# can resolve the module by name.
sys.modules[_spec.name] = _mod
_spec.loader.exec_module(_mod)

# Names under test, re-exported for brevity.
AgentHealth = _mod.AgentHealth
check_agents = _mod.check_agents
load_history = _mod.load_history
parse_targets = _mod.parse_targets
save_history = _mod.save_history
sort_key = None
to_markdown = _mod.to_markdown
write_dashboard = _mod.write_dashboard
main = _mod.main
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseTargets:
    """parse_targets(): built-in defaults and explicit CSV input."""

    def test_defaults_when_none(self):
        """None falls back to the built-in fleet endpoints."""
        defaults = parse_targets(None)
        for agent, suffix in (("allegro", ":8651/health"), ("ezra", ":8652/health")):
            assert defaults[agent].endswith(suffix)

    def test_parse_csv_mapping(self):
        """A comma-separated name=url list becomes a name-to-URL mapping."""
        expected = dict(alpha="http://a/health", beta="http://b/health")
        assert parse_targets("alpha=http://a/health,beta=http://b/health") == expected
|
||||||
|
|
||||||
|
|
||||||
|
class TestCheckAgents:
    """check_agents(): history bookkeeping and stale detection, probe mocked."""

    def test_updates_last_success_for_healthy(self):
        """A successful probe records a last-success timestamp in history."""
        history = {"agents": {}, "runs": []}
        with patch(
            "webhook_health_dashboard_test.probe_health",
            return_value=(True, 200, 42, "HTTP 200"),
        ):
            results = check_agents(
                {"allegro": "http://localhost:8651/health"},
                history,
                timeout=1,
                stale_after=300,
            )
        assert len(results) == 1
        assert results[0].healthy is True
        assert history["agents"]["allegro"]["last_success_ts"] is not None

    def test_marks_stale_after_threshold(self):
        """A failing agent whose last success is past the threshold is stale."""
        long_ago = time.time() - 301
        history = {
            "agents": {"allegro": {"last_success_ts": long_ago}},
            "runs": [],
        }
        with patch(
            "webhook_health_dashboard_test.probe_health",
            return_value=(False, None, 12, "URL error: refused"),
        ):
            results = check_agents(
                {"allegro": "http://localhost:8651/health"},
                history,
                timeout=1,
                stale_after=300,
            )
        first = results[0]
        assert first.healthy is False
        assert first.stale is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestMarkdown:
    """to_markdown(): table header, status icons and the stale section."""

    def test_contains_table_and_icons(self):
        """One healthy and one stale agent yield both icons and a stale list."""
        now = time.time()
        healthy_agent = AgentHealth(
            name="allegro",
            url="http://localhost:8651/health",
            http_status=200,
            healthy=True,
            latency_ms=31,
            stale=False,
            last_success_ts=now - 5,
            checked_at=now,
            message="HTTP 200 — ok",
        )
        stale_agent = AgentHealth(
            name="ezra",
            url="http://localhost:8652/health",
            http_status=None,
            healthy=False,
            latency_ms=14,
            stale=True,
            last_success_ts=now - 600,
            checked_at=now,
            message="URL error: refused",
        )
        md = to_markdown([healthy_agent, stale_agent], generated_at=now)
        for fragment in ("| Agent | Status | HTTP |", "🟢", "🔴", "Stale agents", "ezra"):
            assert fragment in md
|
||||||
|
|
||||||
|
|
||||||
|
class TestFileIO:
    """Round-trips through the history and dashboard file helpers."""

    def test_save_and_load_history(self, tmp_path):
        """History written by save_history is read back unchanged."""
        history_file = tmp_path / "history.json"
        original = {"agents": {"a": {"last_success_ts": 1}}, "runs": []}
        save_history(history_file, original)
        assert load_history(history_file) == original

    def test_write_dashboard(self, tmp_path):
        """write_dashboard appends exactly one trailing newline."""
        dashboard_file = tmp_path / "dashboard.md"
        write_dashboard(dashboard_file, "# Test")
        written = dashboard_file.read_text()
        assert written == "# Test\n"
|
||||||
|
|
||||||
|
|
||||||
|
class TestMain:
    """End-to-end run of main() with the network probe mocked out."""

    @patch("webhook_health_dashboard_test.probe_health")
    def test_main_writes_outputs(self, mock_probe, tmp_path):
        """main() writes the dashboard and appends one run to the history."""
        mock_probe.return_value = (True, 200, 10, "HTTP 200")
        output = tmp_path / "dashboard.md"
        history = tmp_path / "history.json"
        rc = main([
            "--targets", "allegro=http://localhost:8651/health",
            "--output", str(output),
            "--history", str(history),
            "--timeout", "1",
            "--stale-after", "300",
        ])
        assert rc == 0
        assert output.exists()
        assert history.exists()
        # Both files are written as UTF-8 and the dashboard holds non-ASCII
        # characters (emoji, em dash), so do not rely on the locale default.
        assert "allegro" in output.read_text(encoding="utf-8")
        runs = json.loads(history.read_text(encoding="utf-8"))["runs"]
        assert len(runs) == 1
|
||||||
Reference in New Issue
Block a user