#!/usr/bin/env python3
"""Meta-heartbeat checker — makes silent cron failures impossible.

Reads every ``*.last`` file in the heartbeat directory and verifies that
no job has been silent for longer than **2× its declared interval**.

If any job is stale, a Gitea alert issue is created (or an existing one
is updated).  When all jobs recover, the issue is closed automatically.

This script itself should be run as a cron job every 15 minutes so the
meta-level is also covered:

    */15 * * * * cd /path/to/the-nexus && \\
        python bin/check_cron_heartbeats.py >> /var/log/bezalel/heartbeat-check.log 2>&1

USAGE
-----
    # Check all jobs; create/update Gitea alert if any stale:
    python bin/check_cron_heartbeats.py

    # Dry-run (no Gitea writes):
    python bin/check_cron_heartbeats.py --dry-run

    # Output Night Watch heartbeat panel markdown:
    python bin/check_cron_heartbeats.py --panel

    # Output JSON (for integration with other tools):
    python bin/check_cron_heartbeats.py --json

    # Use a custom heartbeat directory:
    python bin/check_cron_heartbeats.py --dir /tmp/test-heartbeats

HEARTBEAT DIRECTORY
-------------------
    Primary:  /var/run/bezalel/heartbeats/  (set by ops, writable by cron user)
    Fallback: ~/.bezalel/heartbeats/        (dev machines)
    Override: BEZALEL_HEARTBEAT_DIR env var

ZERO DEPENDENCIES
-----------------
Pure stdlib. No pip installs required.

Refs: #1096
"""
from __future__ import annotations

import argparse
import json
import logging
import os
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("bezalel.heartbeat_checker")

# ── Configuration ─────────────────────────────────────────────────────

PRIMARY_HEARTBEAT_DIR = Path("/var/run/bezalel/heartbeats")
FALLBACK_HEARTBEAT_DIR = Path.home() / ".bezalel" / "heartbeats"

GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com")
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus")
ALERT_TITLE_PREFIX = "[heartbeat-checker]"

# A job is stale when its age exceeds this multiple of its declared interval
STALE_RATIO = 2.0
# Never flag a job as stale if it completed less than this many seconds ago
# (prevents noise immediately after deployment)
MIN_STALE_AGE = 60


def _resolve_heartbeat_dir() -> Path:
    """Return the active heartbeat directory.

    Precedence: BEZALEL_HEARTBEAT_DIR env var, then the system-wide
    primary dir (created + write-probed if absent), then the per-user
    fallback when the primary is not writable.
    """
    env = os.environ.get("BEZALEL_HEARTBEAT_DIR")
    if env:
        return Path(env)
    if PRIMARY_HEARTBEAT_DIR.exists():
        return PRIMARY_HEARTBEAT_DIR
    # Try to create it; fall back to home dir if not permitted.
    # (PermissionError is a subclass of OSError, so OSError covers both.)
    try:
        PRIMARY_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True)
        probe = PRIMARY_HEARTBEAT_DIR / ".write_probe"
        probe.touch()
        probe.unlink()
        return PRIMARY_HEARTBEAT_DIR
    except OSError:
        return FALLBACK_HEARTBEAT_DIR


# ── Data model ────────────────────────────────────────────────────────


@dataclass
class JobStatus:
    """Health status for a single cron job's heartbeat."""

    job: str
    path: Path
    healthy: bool
    age_seconds: float      # -1 if unknown (missing/corrupt)
    interval_seconds: int   # 0 if unknown
    staleness_ratio: float  # age / interval; -1 if unknown; >STALE_RATIO = stale
    last_timestamp: Optional[float]
    pid: Optional[int]
    raw_status: str         # value from the .last file: "ok" / "warn" / "error"
    message: str


@dataclass
class HeartbeatReport:
    """Aggregate report for all cron job heartbeats in a directory."""

    timestamp: float
    heartbeat_dir: Path
    jobs: List[JobStatus] = field(default_factory=list)

    @property
    def stale_jobs(self) -> List[JobStatus]:
        return [j for j in self.jobs if not j.healthy]

    @property
    def overall_healthy(self) -> bool:
        return len(self.stale_jobs) == 0

    # ── Rendering ─────────────────────────────────────────────────

    def to_panel_markdown(self) -> str:
        """Night Watch heartbeat panel — a table of all jobs with their status."""
        ts = time.strftime("%Y-%m-%d %H:%M UTC", time.gmtime(self.timestamp))
        overall = "OK" if self.overall_healthy else "ALERT"
        lines = [
            f"## Heartbeat Panel — {ts}",
            "",
            f"**Overall:** {overall}",
            "",
            "| Job | Status | Age | Interval | Ratio |",
            "|-----|--------|-----|----------|-------|",
        ]
        if not self.jobs:
            lines.append("| *(no heartbeat files found)* | — | — | — | — |")
        else:
            for j in self.jobs:
                icon = "OK" if j.healthy else "STALE"
                age_str = _fmt_duration(j.age_seconds) if j.age_seconds >= 0 else "N/A"
                interval_str = _fmt_duration(j.interval_seconds) if j.interval_seconds > 0 else "N/A"
                ratio_str = f"{j.staleness_ratio:.1f}x" if j.staleness_ratio >= 0 else "N/A"
                lines.append(
                    f"| `{j.job}` | {icon} | {age_str} | {interval_str} | {ratio_str} |"
                )
        if self.stale_jobs:
            lines += ["", "**Stale jobs:**"]
            for j in self.stale_jobs:
                lines.append(f"- `{j.job}`: {j.message}")
        lines += [
            "",
            f"*Heartbeat dir: `{self.heartbeat_dir}`*",
        ]
        return "\n".join(lines)

    def to_alert_body(self) -> str:
        """Gitea issue body when stale jobs are detected."""
        ts = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(self.timestamp))
        stale = self.stale_jobs
        lines = [
            f"## Cron Heartbeat Alert — {ts}",
            "",
            f"**{len(stale)} job(s) have gone silent** (stale > {STALE_RATIO}x interval).",
            "",
            "| Job | Age | Interval | Ratio | Detail |",
            "|-----|-----|----------|-------|--------|",
        ]
        for j in stale:
            age_str = _fmt_duration(j.age_seconds) if j.age_seconds >= 0 else "N/A"
            interval_str = _fmt_duration(j.interval_seconds) if j.interval_seconds > 0 else "N/A"
            ratio_str = f"{j.staleness_ratio:.1f}x" if j.staleness_ratio >= 0 else "N/A"
            lines.append(
                f"| `{j.job}` | {age_str} | {interval_str} | {ratio_str} | {j.message} |"
            )
        lines += [
            "",
            "### What to do",
            "1. `crontab -l` — confirm the job is still scheduled",
            "2. Check the job's log for errors",
            "3. Restart the job if needed",
            "4. Close this issue once fresh heartbeats appear",
            "",
            f"*Generated by `check_cron_heartbeats.py` — dir: `{self.heartbeat_dir}`*",
        ]
        return "\n".join(lines)

    def to_json(self) -> Dict[str, Any]:
        """Plain-dict form for machine consumers (``--json`` flag).

        All values are strict-JSON-serializable; in particular
        ``staleness_ratio`` is never infinite (see ``_read_job_status``).
        """
        return {
            "healthy": self.overall_healthy,
            "timestamp": self.timestamp,
            "heartbeat_dir": str(self.heartbeat_dir),
            "jobs": [
                {
                    "job": j.job,
                    "healthy": j.healthy,
                    "age_seconds": j.age_seconds,
                    "interval_seconds": j.interval_seconds,
                    "staleness_ratio": j.staleness_ratio,
                    "raw_status": j.raw_status,
                    "message": j.message,
                }
                for j in self.jobs
            ],
        }


def _fmt_duration(seconds: float) -> str:
    """Format a duration in seconds as a human-readable string."""
    s = int(seconds)
    if s < 60:
        return f"{s}s"
    if s < 3600:
        return f"{s // 60}m {s % 60}s"
    return f"{s // 3600}h {(s % 3600) // 60}m"


# ── Job scanning ──────────────────────────────────────────────────────


def scan_heartbeats(directory: Path) -> List[JobStatus]:
    """Read every ``*.last`` file in *directory* and return their statuses."""
    if not directory.exists():
        return []
    return [_read_job_status(p.stem, p) for p in sorted(directory.glob("*.last"))]


def _read_job_status(job: str, path: Path) -> JobStatus:
    """Parse one ``.last`` file and produce a ``JobStatus``.

    Never raises: a missing, unreadable, or malformed file yields an
    unhealthy status so one bad heartbeat cannot break the whole scan.
    """
    now = time.time()
    if not path.exists():
        # Race guard: the file matched the glob but vanished before the read.
        return JobStatus(
            job=job,
            path=path,
            healthy=False,
            age_seconds=-1,
            interval_seconds=0,
            staleness_ratio=-1,
            last_timestamp=None,
            pid=None,
            raw_status="missing",
            message=f"Heartbeat file missing: {path}",
        )
    try:
        data = json.loads(path.read_text())
        # BUG FIX: coerce numeric fields *inside* the try — a
        # syntactically-valid JSON file may still carry non-numeric
        # "timestamp"/"interval_seconds" values, which previously raised
        # ValueError/TypeError out of this function and aborted the scan.
        timestamp = float(data.get("timestamp", 0))
        interval = int(data.get("interval_seconds", 0))
    except (json.JSONDecodeError, OSError, TypeError, ValueError) as exc:
        return JobStatus(
            job=job,
            path=path,
            healthy=False,
            age_seconds=-1,
            interval_seconds=0,
            staleness_ratio=-1,
            last_timestamp=None,
            pid=None,
            raw_status="corrupt",
            message=f"Corrupt heartbeat: {exc}",
        )
    pid = data.get("pid")
    raw_status = data.get("status", "ok")
    # Clamp clock skew: a timestamp slightly in the future means "just beat".
    age = max(0.0, now - timestamp)
    if interval > 0:
        ratio = age / interval
        stale = ratio > STALE_RATIO and age > MIN_STALE_AGE
    else:
        # BUG FIX: the JobStatus contract says staleness_ratio is -1 when
        # unknown; the old float("inf") leaked "Infinity" into --json
        # output (invalid strict JSON). The stale decision is unchanged:
        # inf > STALE_RATIO was always true, so it reduced to age > MIN_STALE_AGE.
        ratio = -1.0
        stale = age > MIN_STALE_AGE
    if stale:
        if interval > 0:
            message = (
                f"Silent for {_fmt_duration(age)} "
                f"({ratio:.1f}x interval of {_fmt_duration(interval)})"
            )
        else:
            message = f"Silent for {_fmt_duration(age)} (no declared interval)"
    else:
        ratio_txt = f"{ratio:.1f}x" if ratio >= 0 else "N/A"
        message = f"Last beat {_fmt_duration(age)} ago (ratio {ratio_txt})"
    return JobStatus(
        job=job,
        path=path,
        healthy=not stale,
        age_seconds=age,
        interval_seconds=interval,
        staleness_ratio=ratio,
        last_timestamp=timestamp,
        pid=pid,
        raw_status=raw_status if not stale else "stale",
        message=message,
    )


# ── Gitea alerting ────────────────────────────────────────────────────


def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any:
    """Make a Gitea API request; return parsed JSON or None on error."""
    import urllib.request
    import urllib.error

    url = f"{GITEA_URL.rstrip('/')}/api/v1{path}"
    # BUG FIX: test against None, not truthiness — an explicit empty dict
    # payload ({}) was previously dropped and sent as no body at all.
    body = json.dumps(data).encode() if data is not None else None
    req = urllib.request.Request(url, data=body, method=method)
    if GITEA_TOKEN:
        req.add_header("Authorization", f"token {GITEA_TOKEN}")
    req.add_header("Content-Type", "application/json")
    req.add_header("Accept", "application/json")
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            raw = resp.read().decode()
            return json.loads(raw) if raw.strip() else {}
    except urllib.error.HTTPError as exc:
        logger.warning("Gitea %d: %s", exc.code, exc.read().decode()[:200])
        return None
    except Exception as exc:
        # Deliberate best-effort: alerting must never crash the checker.
        logger.warning("Gitea request failed: %s", exc)
        return None


def _find_open_alert_issue() -> Optional[dict]:
    """Return the currently-open heartbeat alert issue, if any."""
    issues = _gitea_request(
        "GET",
        f"/repos/{GITEA_REPO}/issues?state=open&type=issues&limit=20",
    )
    if not isinstance(issues, list):
        return None
    for issue in issues:
        if issue.get("title", "").startswith(ALERT_TITLE_PREFIX):
            return issue
    return None


def alert_on_stale(report: HeartbeatReport, dry_run: bool = False) -> None:
    """Create, update, or close a Gitea alert issue based on report health."""
    if dry_run:
        action = "close" if report.overall_healthy else "create/update"
        logger.info("DRY RUN — would %s Gitea issue", action)
        return
    if not GITEA_TOKEN:
        logger.warning("GITEA_TOKEN not set — skipping Gitea alert")
        return

    existing = _find_open_alert_issue()

    if report.overall_healthy:
        if existing:
            logger.info("All heartbeats healthy — closing issue #%d", existing["number"])
            _gitea_request(
                "POST",
                f"/repos/{GITEA_REPO}/issues/{existing['number']}/comments",
                data={"body": "All cron heartbeats are now fresh. Closing."},
            )
            _gitea_request(
                "PATCH",
                f"/repos/{GITEA_REPO}/issues/{existing['number']}",
                data={"state": "closed"},
            )
        return

    stale_names = ", ".join(j.job for j in report.stale_jobs)
    title = f"{ALERT_TITLE_PREFIX} Stale cron heartbeats: {stale_names}"
    body = report.to_alert_body()

    if existing:
        logger.info("Still stale — updating issue #%d", existing["number"])
        _gitea_request(
            "POST",
            f"/repos/{GITEA_REPO}/issues/{existing['number']}/comments",
            data={"body": body},
        )
    else:
        result = _gitea_request(
            "POST",
            f"/repos/{GITEA_REPO}/issues",
            data={"title": title, "body": body, "assignees": ["Timmy"]},
        )
        if result and result.get("number"):
            logger.info("Created alert issue #%d", result["number"])


# ── Entry point ───────────────────────────────────────────────────────


def build_report(directory: Optional[Path] = None) -> HeartbeatReport:
    """Scan heartbeats and return a report. Exposed for Night Watch import."""
    hb_dir = directory if directory is not None else _resolve_heartbeat_dir()
    jobs = scan_heartbeats(hb_dir)
    return HeartbeatReport(timestamp=time.time(), heartbeat_dir=hb_dir, jobs=jobs)


def main() -> None:
    """CLI entry point — exits 0 when all heartbeats are fresh, 1 otherwise."""
    parser = argparse.ArgumentParser(
        description="Meta-heartbeat checker — detects silent cron failures",
    )
    parser.add_argument(
        "--dir",
        default=None,
        help="Heartbeat directory (default: auto-detect)",
    )
    parser.add_argument(
        "--panel",
        action="store_true",
        help="Output Night Watch heartbeat panel markdown and exit",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="output_json",
        help="Output results as JSON and exit",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Log results without writing Gitea issues",
    )
    args = parser.parse_args()

    report = build_report(Path(args.dir) if args.dir else None)

    if args.panel:
        # Panel mode is display-only: always exits 0 regardless of health.
        print(report.to_panel_markdown())
        return

    if args.output_json:
        print(json.dumps(report.to_json(), indent=2))
        sys.exit(0 if report.overall_healthy else 1)

    # Default: log + alert
    if not report.jobs:
        logger.info("No heartbeat files found in %s", report.heartbeat_dir)
    else:
        for j in report.jobs:
            level = logging.INFO if j.healthy else logging.ERROR
            icon = "OK " if j.healthy else "STALE"
            logger.log(level, "[%s] %s: %s", icon, j.job, j.message)

    alert_on_stale(report, dry_run=args.dry_run)
    sys.exit(0 if report.overall_healthy else 1)


if __name__ == "__main__":
    main()