Files
the-nexus/bin/check_cron_heartbeats.py
2026-04-07 14:38:55 +00:00

450 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Meta-heartbeat checker — makes silent cron failures impossible.
Reads every ``*.last`` file in the heartbeat directory and verifies that no
job has been silent for longer than **2× its declared interval**. If any job
is stale, a Gitea alert issue is created (or an existing one is updated).
When all jobs recover, the issue is closed automatically.
This script itself should be run as a cron job every 15 minutes so the
meta-level is also covered:
*/15 * * * * cd /path/to/the-nexus && \\
python bin/check_cron_heartbeats.py >> /var/log/bezalel/heartbeat-check.log 2>&1
USAGE
-----
# Check all jobs; create/update Gitea alert if any stale:
python bin/check_cron_heartbeats.py
# Dry-run (no Gitea writes):
python bin/check_cron_heartbeats.py --dry-run
# Output Night Watch heartbeat panel markdown:
python bin/check_cron_heartbeats.py --panel
# Output JSON (for integration with other tools):
python bin/check_cron_heartbeats.py --json
# Use a custom heartbeat directory:
python bin/check_cron_heartbeats.py --dir /tmp/test-heartbeats
HEARTBEAT DIRECTORY
-------------------
Primary: /var/run/bezalel/heartbeats/ (set by ops, writable by cron user)
Fallback: ~/.bezalel/heartbeats/ (dev machines)
Override: BEZALEL_HEARTBEAT_DIR env var
ZERO DEPENDENCIES
-----------------
Pure stdlib. No pip installs required.
Refs: #1096
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
# Configure the root logger once, at import time: INFO and above, with a
# timestamp and a left-aligned, 7-character-padded level name on every line.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Named logger used for this script's diagnostics.
logger = logging.getLogger("bezalel.heartbeat_checker")
# ── Configuration ─────────────────────────────────────────────────────
# Candidate heartbeat directories; _resolve_heartbeat_dir() chooses between
# them unless the BEZALEL_HEARTBEAT_DIR environment variable overrides both.
PRIMARY_HEARTBEAT_DIR = Path("/var/run/bezalel/heartbeats")
FALLBACK_HEARTBEAT_DIR = Path.home() / ".bezalel" / "heartbeats"
# Gitea connection settings, each read from the environment with a default.
# An empty GITEA_TOKEN makes alert_on_stale() skip all Gitea writes.
GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
# Alert issues are recognised by this title prefix when searching open issues.
ALERT_TITLE_PREFIX = "[heartbeat-checker]"
# A job is stale when its age exceeds this multiple of its declared interval
STALE_RATIO = 2.0
# Never flag a job as stale if it completed less than this many seconds ago
# (prevents noise immediately after deployment)
MIN_STALE_AGE = 60
def _resolve_heartbeat_dir() -> Path:
    """Pick the heartbeat directory this run should use.

    Resolution order:
      1. ``BEZALEL_HEARTBEAT_DIR`` from the environment, if non-empty.
      2. The primary directory, if it already exists.
      3. The primary directory, if it can be created and written to.
      4. The fallback directory under the user's home.
    """
    override = os.environ.get("BEZALEL_HEARTBEAT_DIR")
    if override:
        return Path(override)

    primary = PRIMARY_HEARTBEAT_DIR
    if not primary.exists():
        try:
            primary.mkdir(parents=True, exist_ok=True)
            # Creating the directory is not enough; confirm a file can be
            # written and removed inside it.
            write_probe = primary / ".write_probe"
            write_probe.touch()
            write_probe.unlink()
        except OSError:  # PermissionError is a subclass of OSError
            return FALLBACK_HEARTBEAT_DIR
    return primary
# ── Data model ────────────────────────────────────────────────────────
@dataclass
class JobStatus:
    """Snapshot of one cron job's heartbeat, built from its ``.last`` file.

    Numeric fields use sentinels instead of ``None`` when the value is
    unknown (file missing or corrupt): ``age_seconds`` and
    ``staleness_ratio`` are ``-1`` and ``interval_seconds`` is ``0``.
    """

    # Identity: job name and the heartbeat file it was read from.
    job: str
    path: Path
    # Verdict: False when the job is stale or its heartbeat is unreadable.
    healthy: bool
    # Timing, in seconds (see the class docstring for the sentinels).
    age_seconds: float
    interval_seconds: int
    # age / interval; a value above STALE_RATIO means the job is stale.
    staleness_ratio: float
    # Raw values from the heartbeat file, when available.
    last_timestamp: Optional[float]
    pid: Optional[int]
    # The file's "status" value ("ok" / "warn" / "error"), or a value set by
    # the checker itself: "missing", "corrupt" or "stale".
    raw_status: str
    # Human-readable one-line explanation of the verdict.
    message: str
@dataclass
class HeartbeatReport:
    """Result of one heartbeat scan, with a renderer per output format."""

    timestamp: float
    heartbeat_dir: Path
    jobs: List[JobStatus] = field(default_factory=list)

    @property
    def stale_jobs(self) -> List[JobStatus]:
        """Jobs whose ``healthy`` flag is false, in scan order."""
        return [entry for entry in self.jobs if not entry.healthy]

    @property
    def overall_healthy(self) -> bool:
        """True when no job is unhealthy (also true for an empty report)."""
        return not self.stale_jobs

    # ── Rendering ─────────────────────────────────────────────────────
    @staticmethod
    def _timing_cells(entry):
        """Return the (age, interval, ratio) table cells for one job.

        Each cell falls back to ``"N/A"`` when the job carries the
        corresponding "unknown" sentinel.
        """
        age_cell = "N/A"
        if entry.age_seconds >= 0:
            age_cell = _fmt_duration(entry.age_seconds)
        interval_cell = "N/A"
        if entry.interval_seconds > 0:
            interval_cell = _fmt_duration(entry.interval_seconds)
        ratio_cell = "N/A"
        if entry.staleness_ratio >= 0:
            ratio_cell = f"{entry.staleness_ratio:.1f}x"
        return age_cell, interval_cell, ratio_cell

    def to_panel_markdown(self) -> str:
        """Render the Night Watch panel: a markdown table of every job."""
        stamp = time.strftime("%Y-%m-%d %H:%M UTC", time.gmtime(self.timestamp))
        verdict = "OK" if self.overall_healthy else "ALERT"
        out = [
            f"## Heartbeat Panel — {stamp}",
            "",
            f"**Overall:** {verdict}",
            "",
            "| Job | Status | Age | Interval | Ratio |",
            "|-----|--------|-----|----------|-------|",
        ]

        if self.jobs:
            for entry in self.jobs:
                state = "OK" if entry.healthy else "STALE"
                age_cell, interval_cell, ratio_cell = self._timing_cells(entry)
                out.append(
                    f"| `{entry.job}` | {state} | {age_cell} | {interval_cell} | {ratio_cell} |"
                )
        else:
            out.append("| *(no heartbeat files found)* | — | — | — | — |")

        unhealthy = self.stale_jobs
        if unhealthy:
            out.append("")
            out.append("**Stale jobs:**")
            out.extend(f"- `{entry.job}`: {entry.message}" for entry in unhealthy)

        out.append("")
        out.append(f"*Heartbeat dir: `{self.heartbeat_dir}`*")
        return "\n".join(out)

    def to_alert_body(self) -> str:
        """Render the markdown body of the Gitea alert issue/comment."""
        stamp = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(self.timestamp))
        unhealthy = self.stale_jobs

        header = [
            f"## Cron Heartbeat Alert — {stamp}",
            "",
            f"**{len(unhealthy)} job(s) have gone silent** (stale > {STALE_RATIO}x interval).",
            "",
            "| Job | Age | Interval | Ratio | Detail |",
            "|-----|-----|----------|-------|--------|",
        ]

        rows = []
        for entry in unhealthy:
            age_cell, interval_cell, ratio_cell = self._timing_cells(entry)
            rows.append(
                f"| `{entry.job}` | {age_cell} | {interval_cell} | {ratio_cell} | {entry.message} |"
            )

        footer = [
            "",
            "### What to do",
            "1. `crontab -l` — confirm the job is still scheduled",
            "2. Check the job's log for errors",
            "3. Restart the job if needed",
            "4. Close this issue once fresh heartbeats appear",
            "",
            f"*Generated by `check_cron_heartbeats.py` — dir: `{self.heartbeat_dir}`*",
        ]
        return "\n".join(header + rows + footer)

    def to_json(self) -> Dict[str, Any]:
        """Return the report as a plain dict suitable for ``json.dumps``."""
        job_records = []
        for entry in self.jobs:
            job_records.append(
                {
                    "job": entry.job,
                    "healthy": entry.healthy,
                    "age_seconds": entry.age_seconds,
                    "interval_seconds": entry.interval_seconds,
                    "staleness_ratio": entry.staleness_ratio,
                    "raw_status": entry.raw_status,
                    "message": entry.message,
                }
            )
        return {
            "healthy": self.overall_healthy,
            "timestamp": self.timestamp,
            "heartbeat_dir": str(self.heartbeat_dir),
            "jobs": job_records,
        }
def _fmt_duration(seconds: float) -> str:
    """Render *seconds* compactly: ``"42s"``, ``"3m 7s"`` or ``"2h 15m"``.

    The value is truncated to a whole number of seconds first. From one
    hour upwards the seconds component is dropped.
    """
    total = int(seconds)
    if total >= 3600:
        hours, remainder = divmod(total, 3600)
        return f"{hours}h {remainder // 60}m"
    if total >= 60:
        minutes, secs = divmod(total, 60)
        return f"{minutes}m {secs}s"
    return f"{total}s"
# ── Job scanning ──────────────────────────────────────────────────────
def scan_heartbeats(directory: Path) -> List[JobStatus]:
    """Collect a status for each ``*.last`` file found in *directory*.

    Files are processed in sorted path order and the job name is the file
    name without its ``.last`` suffix. A directory that does not exist
    yields an empty list.
    """
    statuses = []
    if directory.exists():
        for heartbeat_file in sorted(directory.glob("*.last")):
            statuses.append(_read_job_status(heartbeat_file.stem, heartbeat_file))
    return statuses
def _unreadable_status(job: str, path: Path, raw_status: str, message: str) -> JobStatus:
    """Return the unhealthy ``JobStatus`` used when a heartbeat cannot be evaluated."""
    return JobStatus(
        job=job,
        path=path,
        healthy=False,
        age_seconds=-1,
        interval_seconds=0,
        staleness_ratio=-1,
        last_timestamp=None,
        pid=None,
        raw_status=raw_status,
        message=message,
    )


def _read_job_status(job: str, path: Path) -> JobStatus:
    """Parse one ``.last`` file and produce a ``JobStatus``.

    This function never raises for bad heartbeat content: one malformed
    file must not take down the checker that is meant to catch failures.

    Args:
        job: Job name to report (the caller passes the file stem).
        path: Heartbeat file holding a JSON object with ``timestamp`` and
            ``interval_seconds`` and, optionally, ``pid`` and ``status``.

    Returns:
        * ``raw_status="missing"`` if the file does not exist.
        * ``raw_status="corrupt"`` if it cannot be read or decoded, is not a
          JSON object, or has a non-numeric / non-finite ``timestamp`` or a
          non-numeric ``interval_seconds``.
        * Otherwise a status whose ``healthy`` flag is false when the age
          exceeds both ``STALE_RATIO`` x interval and ``MIN_STALE_AGE``.
          When no positive interval is declared the ratio is reported as
          ``-1`` (unknown) and only ``MIN_STALE_AGE`` is applied.
    """
    import math  # local import: only needed for the finiteness check below

    now = time.time()

    # EAFP instead of exists()+read: the file may vanish between the two.
    try:
        text = path.read_text()
    except FileNotFoundError:
        return _unreadable_status(
            job, path, "missing", f"Heartbeat file missing: {path}"
        )
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError raised for undecodable bytes.
        return _unreadable_status(job, path, "corrupt", f"Corrupt heartbeat: {exc}")

    try:
        data = json.loads(text)
        if not isinstance(data, dict):
            raise ValueError(
                f"expected a JSON object, got {type(data).__name__}"
            )
        timestamp = float(data.get("timestamp", 0))
        interval = int(data.get("interval_seconds", 0))
        if not math.isfinite(timestamp):
            # NaN would compare false against every threshold and make the
            # job look healthy forever.
            raise ValueError(f"non-finite timestamp: {timestamp}")
    except (ValueError, TypeError, OverflowError) as exc:
        # json.JSONDecodeError is a ValueError; TypeError/OverflowError come
        # from float()/int() on null, containers or infinite values.
        return _unreadable_status(job, path, "corrupt", f"Corrupt heartbeat: {exc}")

    pid = data.get("pid")
    raw_status = data.get("status", "ok")
    age = now - timestamp

    if interval > 0:
        ratio = age / interval
        stale = ratio > STALE_RATIO and age > MIN_STALE_AGE
        if stale:
            message = (
                f"Silent for {_fmt_duration(age)} "
                f"({ratio:.1f}x interval of {_fmt_duration(interval)})"
            )
        else:
            message = f"Last beat {_fmt_duration(age)} ago (ratio {ratio:.1f}x)"
    else:
        # No usable interval: keep the "stale once older than MIN_STALE_AGE"
        # decision, but report the ratio as unknown (-1) rather than
        # infinity, which renders as "infx" and serialises as invalid JSON.
        interval = 0
        ratio = -1.0
        stale = age > MIN_STALE_AGE
        if stale:
            message = (
                f"Silent for {_fmt_duration(age)} "
                "(no valid interval_seconds declared)"
            )
        else:
            message = f"Last beat {_fmt_duration(age)} ago (interval unknown)"

    return JobStatus(
        job=job,
        path=path,
        healthy=not stale,
        age_seconds=age,
        interval_seconds=interval,
        staleness_ratio=ratio,
        last_timestamp=timestamp,
        pid=pid,
        raw_status="stale" if stale else raw_status,
        message=message,
    )
# ── Gitea alerting ────────────────────────────────────────────────────
def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
    """Make a Gitea API request; return parsed JSON or None on error.

    Args:
        method: HTTP method, e.g. ``"GET"``, ``"POST"`` or ``"PATCH"``.
        path: API path appended to ``<GITEA_URL>/api/v1``; it may include a
            query string.
        data: Optional JSON-serialisable payload sent as the request body.

    Returns:
        The decoded JSON response, ``{}`` for an empty response body, or
        ``None`` if the request failed for any reason. Failures are logged
        as warnings and never raised.
    """
    import urllib.request
    import urllib.error

    url = f"{GITEA_URL.rstrip('/')}/api/v1{path}"
    # Test against None so an explicitly empty payload is still sent as "{}".
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    if GITEA_TOKEN:
        req.add_header("Authorization", f"token {GITEA_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            raw = resp.read().decode()
        return json.loads(raw) if raw.strip() else {}
    except urllib.error.HTTPError as exc:
        # Reading or decoding the error body must not raise from inside this
        # handler; that would break the "None on error" contract.
        try:
            detail = exc.read().decode("utf-8", errors="replace")[:200]
        except Exception:  # body unavailable; the status code is still useful
            detail = "<unreadable response body>"
        logger.warning("Gitea %d: %s", exc.code, detail)
        return None
    except Exception as exc:
        # Deliberately broad: network errors, timeouts and invalid JSON are
        # all reported to the caller as None.
        logger.warning("Gitea request failed: %s", exc)
        return None
def _find_open_alert_issue() -> Optional[dict]:
    """Return the open issue whose title starts with ``ALERT_TITLE_PREFIX``.

    The open-issue list is fetched page by page. Looking at a single page
    would miss the alert in a busy repository, and a duplicate alert issue
    would then be created on every run.

    Returns:
        The first matching issue dict, or ``None`` if there is no match or
        the Gitea request failed.
    """
    page_size = 50
    max_pages = 20  # hard upper bound on requests per run

    for page in range(1, max_pages + 1):
        issues = _gitea_request(
            "GET",
            f"/repos/{GITEA_REPO}/issues?state=open&type=issues"
            f"&limit={page_size}&page={page}",
        )
        # None (request failed), a non-list payload, or an empty page all
        # end the search. Only an empty page is used as the end marker, so
        # a server that caps the page size below `page_size` still works.
        if not isinstance(issues, list) or not issues:
            return None
        for issue in issues:
            if not isinstance(issue, dict):
                continue
            title = issue.get("title") or ""  # tolerate a null title
            if title.startswith(ALERT_TITLE_PREFIX):
                return issue

    logger.warning(
        "No alert issue found in the first %d pages of open issues; giving up",
        max_pages,
    )
    return None
def alert_on_stale(report: HeartbeatReport, dry_run: bool = False) -> None:
    """Create, update, or close a Gitea alert issue based on report health.

    Args:
        report: The heartbeat report to act on.
        dry_run: When true, only log what would be done and return.

    Behaviour:
        * No ``GITEA_TOKEN``: log a warning and do nothing.
        * Report healthy and an alert issue is open: post a recovery comment
          and close the issue.
        * Report unhealthy and an alert issue is open: post the current
          alert body as a new comment.
        * Report unhealthy and no alert issue is open: create one.

    Gitea failures are logged and never raised.
    """
    if dry_run:
        action = "close" if report.overall_healthy else "create/update"
        logger.info("DRY RUN — would %s Gitea issue", action)
        return
    if not GITEA_TOKEN:
        logger.warning("GITEA_TOKEN not set — skipping Gitea alert")
        return

    existing = _find_open_alert_issue()

    if report.overall_healthy:
        if existing:
            logger.info("All heartbeats healthy — closing issue #%d", existing["number"])
            _gitea_request(
                "POST",
                f"/repos/{GITEA_REPO}/issues/{existing['number']}/comments",
                data={"body": "All cron heartbeats are now fresh. Closing."},
            )
            closed = _gitea_request(
                "PATCH",
                f"/repos/{GITEA_REPO}/issues/{existing['number']}",
                data={"state": "closed"},
            )
            if closed is None:
                logger.warning("Failed to close issue #%d", existing["number"])
        return

    stale_names = ", ".join(j.job for j in report.stale_jobs)
    title = f"{ALERT_TITLE_PREFIX} Stale cron heartbeats: {stale_names}"
    # The title grows with the number of stale jobs. Cap it so an outage
    # affecting many jobs cannot cause the issue creation to be rejected;
    # the body still lists every job.
    # NOTE(review): 255 assumes Gitea's issue-title length limit; confirm
    # against the deployed Gitea version.
    max_title_len = 255
    if len(title) > max_title_len:
        title = title[: max_title_len - 3] + "..."
    body = report.to_alert_body()

    if existing:
        logger.info("Still stale — updating issue #%d", existing["number"])
        _gitea_request(
            "POST",
            f"/repos/{GITEA_REPO}/issues/{existing['number']}/comments",
            data={"body": body},
        )
    else:
        result = _gitea_request(
            "POST",
            f"/repos/{GITEA_REPO}/issues",
            data={"title": title, "body": body, "assignees": ["Timmy"]},
        )
        if isinstance(result, dict) and result.get("number"):
            logger.info("Created alert issue #%d", result["number"])
        else:
            # Say so explicitly: stale jobs with no alert raised is the
            # failure this script exists to prevent.
            logger.error(
                "Failed to create Gitea alert issue for stale jobs: %s",
                stale_names,
            )
# ── Entry point ───────────────────────────────────────────────────────
def build_report(directory: Optional[Path] = None) -> HeartbeatReport:
    """Scan a heartbeat directory and wrap the result in a report.

    When *directory* is ``None`` the directory is auto-detected. Exposed so
    that Night Watch can import and call it.
    """
    if directory is None:
        target_dir = _resolve_heartbeat_dir()
    else:
        target_dir = directory
    statuses = scan_heartbeats(target_dir)
    return HeartbeatReport(
        timestamp=time.time(),
        heartbeat_dir=target_dir,
        jobs=statuses,
    )
def main() -> None:
    """Command-line entry point.

    Modes:
      * ``--panel``: print the markdown panel and return.
      * ``--json``: print the JSON report, then exit 0 if healthy, else 1.
      * default: log one line per job, run the Gitea alert logic (honouring
        ``--dry-run``), then exit 0 if healthy, else 1.
    """
    cli = argparse.ArgumentParser(
        description="Meta-heartbeat checker — detects silent cron failures",
    )
    cli.add_argument(
        "--dir",
        default=None,
        help="Heartbeat directory (default: auto-detect)",
    )
    cli.add_argument(
        "--panel",
        action="store_true",
        help="Output Night Watch heartbeat panel markdown and exit",
    )
    cli.add_argument(
        "--json",
        action="store_true",
        dest="output_json",
        help="Output results as JSON and exit",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Log results without writing Gitea issues",
    )
    opts = cli.parse_args()

    chosen_dir = Path(opts.dir) if opts.dir else None
    report = build_report(chosen_dir)

    if opts.panel:
        print(report.to_panel_markdown())
        return

    if opts.output_json:
        print(json.dumps(report.to_json(), indent=2))
        sys.exit(0 if report.overall_healthy else 1)

    # Default: log + alert
    if report.jobs:
        for status in report.jobs:
            if status.healthy:
                level, tag = logging.INFO, "OK "
            else:
                level, tag = logging.ERROR, "STALE"
            logger.log(level, "[%s] %s: %s", tag, status.job, status.message)
    else:
        logger.info("No heartbeat files found in %s", report.heartbeat_dir)

    alert_on_stale(report, dry_run=opts.dry_run)
    sys.exit(0 if report.overall_healthy else 1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()