#!/usr/bin/env python3
"""
Bezalel Meta-Heartbeat Checker — stale cron detection (poka-yoke #1096)

Monitors all cron job heartbeat files and alerts P1 when any job has been
silent for more than 2× its declared interval.

POKA-YOKE design:
  Prevention — cron-heartbeat-write.sh writes a .last file atomically after
               every successful cron job completion, stamping its interval.
  Detection  — this script runs every 15 minutes (via systemd timer) and
               raises P1 on stderr + writes an alert file for any stale job.
  Correction — alerts are loud enough (P1 stderr + alert files) for
               monitoring/humans to intervene before the next run window.

ZERO DEPENDENCIES
=================
Pure stdlib. No pip installs.

USAGE
=====
    # One-shot check (default dir)
    python bin/bezalel_heartbeat_check.py

    # Override heartbeat dir
    python bin/bezalel_heartbeat_check.py --heartbeat-dir /tmp/test-beats

    # Dry-run (check + report, don't write alert files)
    python bin/bezalel_heartbeat_check.py --dry-run

    # JSON output (for piping into other tools)
    python bin/bezalel_heartbeat_check.py --json

EXIT CODES
==========
    0 — all jobs healthy (or no .last files found yet)
    1 — one or more stale beats detected
    2 — heartbeat dir unreadable

IMPORTABLE API
==============
    from bin.bezalel_heartbeat_check import check_cron_heartbeats

    result = check_cron_heartbeats("/var/run/bezalel/heartbeats")
    # Returns dict with keys: checked_at, jobs, stale_count, healthy_count

Refs: https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
"""

from __future__ import annotations

import argparse
import json
import logging
import math
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("bezalel.heartbeat")

# ── Configuration ────────────────────────────────────────────────────

DEFAULT_HEARTBEAT_DIR = "/var/run/bezalel/heartbeats"

# Interval assumed when a heartbeat file does not declare one (or is corrupt).
DEFAULT_INTERVAL_SECS = 3600


# ── Internal helpers ─────────────────────────────────────────────────

def _json_safe(value: Any) -> Any:
    """Return a copy of *value* with non-finite floats replaced by None.

    ``json.dumps`` would otherwise emit the bare tokens ``Infinity``/``NaN``,
    which are not valid JSON and break strict downstream parsers.
    """
    if isinstance(value, float) and not math.isfinite(value):
        return None
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    return value


def _corrupt_job(job_name: str, reason: object) -> Dict[str, Any]:
    """Build the job record used for an unreadable/unparseable heartbeat file."""
    return {
        "job": job_name,
        "healthy": False,
        "age_secs": float("inf"),
        "interval": DEFAULT_INTERVAL_SECS,
        "last_seen": None,
        "message": f"CORRUPT: cannot read/parse heartbeat file: {reason}",
    }


def _clear_alert(heartbeat_dir: str, job_name: str) -> None:
    """Remove <heartbeat_dir>/alerts/<job_name>.alert if present (best effort)."""
    alert_file = Path(heartbeat_dir) / "alerts" / f"{job_name}.alert"
    try:
        alert_file.unlink(missing_ok=True)
    except OSError as exc:
        logger.warning("Failed to clear alert file %s: %s", alert_file, exc)


# ── Core checker ─────────────────────────────────────────────────────

def check_cron_heartbeats(heartbeat_dir: str = DEFAULT_HEARTBEAT_DIR) -> Dict[str, Any]:
    """
    Scan all .last files in heartbeat_dir and determine which jobs are stale.

    Returns a dict:
    {
        "checked_at": "<ISO 8601 UTC timestamp of this check>",
        "jobs": [
            {
                "job": str,
                "healthy": bool,
                "age_secs": float,        # float("inf") for corrupt files
                "interval": int,
                "last_seen": str or None,  # ISO timestamp of last heartbeat
                "message": str,
            },
            ...
        ],
        "stale_count": int,
        "healthy_count": int,
    }

    On a missing or empty dir (no .last files), returns jobs=[] with
    stale_count=0.

    On a corrupt .last file, reports that job as stale with an error message.
    "Corrupt" covers unreadable files, invalid UTF-8, invalid JSON, a JSON
    payload that is not an object, and timestamp/interval values that are
    not finite numbers. None of these abort the scan of the other jobs.

    Refs: #1096
    """
    now_ts = time.time()
    checked_at = datetime.fromtimestamp(now_ts, tz=timezone.utc).isoformat()

    hb_path = Path(heartbeat_dir)
    jobs: List[Dict[str, Any]] = []

    if hb_path.is_dir():
        for last_file in sorted(hb_path.glob("*.last")):
            job_name = last_file.stem  # filename without .last extension

            # Read, parse and validate the heartbeat file. ValueError covers
            # both json.JSONDecodeError and UnicodeDecodeError.
            try:
                data = json.loads(last_file.read_text(encoding="utf-8"))
                if not isinstance(data, dict):
                    raise ValueError(
                        f"expected a JSON object, got {type(data).__name__}"
                    )
                beat_timestamp = float(data.get("timestamp", 0))
                interval = int(data.get("interval", DEFAULT_INTERVAL_SECS))
                # NaN compares False against everything, so a NaN timestamp
                # would otherwise look healthy forever.
                if not math.isfinite(beat_timestamp):
                    raise ValueError(f"non-finite timestamp {beat_timestamp!r}")
            except (OSError, ValueError, TypeError, OverflowError) as exc:
                jobs.append(_corrupt_job(job_name, exc))
                continue

            age_secs = now_ts - beat_timestamp

            # Convert beat_timestamp to a readable ISO string
            try:
                last_seen: Optional[str] = datetime.fromtimestamp(
                    beat_timestamp, tz=timezone.utc
                ).isoformat()
            except (OSError, OverflowError, ValueError):
                last_seen = None

            # Stale = silent for more than 2× the declared interval
            threshold = 2 * interval
            is_stale = age_secs > threshold

            if is_stale:
                message = (
                    f"STALE (last {age_secs:.0f}s ago, interval {interval}s"
                    f" — exceeds 2x threshold of {threshold}s)"
                )
            else:
                message = f"OK (last {age_secs:.0f}s ago, interval {interval}s)"

            jobs.append({
                "job": job_name,
                "healthy": not is_stale,
                "age_secs": age_secs,
                "interval": interval,
                "last_seen": last_seen,
                "message": message,
            })

    healthy_count = sum(1 for job in jobs if job["healthy"])
    return {
        "checked_at": checked_at,
        "jobs": jobs,
        "stale_count": len(jobs) - healthy_count,
        "healthy_count": healthy_count,
    }


# ── Alert file writer ────────────────────────────────────────────────

def write_alert(heartbeat_dir: str, job_info: Dict[str, Any]) -> None:
    """
    Write an alert file for a stale job to <heartbeat_dir>/alerts/<job>.alert

    Alert files are watched by external monitoring. They persist until the
    job reports healthy again, at which point run_check() removes them on the
    next (non-dry-run) check cycle.

    Non-finite numbers (e.g. age_secs of a corrupt heartbeat) are written as
    JSON null so the file is always strictly valid JSON.

    Failures to create the directory or write the file are logged as warnings
    and otherwise ignored: alert files are a best-effort side channel.

    Refs: #1096
    """
    alerts_dir = Path(heartbeat_dir) / "alerts"
    try:
        alerts_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        logger.warning("Cannot create alerts dir %s: %s", alerts_dir, exc)
        return

    alert_file = alerts_dir / f"{job_info['job']}.alert"
    now_str = datetime.now(tz=timezone.utc).isoformat()

    content = _json_safe({
        "alert_level": "P1",
        "job": job_info["job"],
        "message": job_info["message"],
        "age_secs": job_info["age_secs"],
        "interval": job_info["interval"],
        "last_seen": job_info["last_seen"],
        "detected_at": now_str,
    })

    # Atomic write via temp + rename (same poka-yoke pattern as the writer)
    tmp_file = alerts_dir / f"{job_info['job']}.alert.tmp.{os.getpid()}"
    try:
        tmp_file.write_text(json.dumps(content, indent=2), encoding="utf-8")
        # replace() overwrites an existing alert on every platform.
        tmp_file.replace(alert_file)
    except OSError as exc:
        logger.warning("Failed to write alert file %s: %s", alert_file, exc)
        try:
            tmp_file.unlink(missing_ok=True)
        except OSError:
            pass  # best-effort cleanup; the write failure is already logged


# ── Main runner ──────────────────────────────────────────────────────

def run_check(heartbeat_dir: str, dry_run: bool = False, output_json: bool = False) -> int:
    """
    Run a full heartbeat check cycle.

    In JSON mode the result is printed to stdout and no alert files are
    touched. Otherwise each job is logged, stale jobs raise a P1 line on
    stderr and get an alert file, and healthy jobs have any leftover alert
    file removed. With dry_run=True no alert file is written or removed.

    Returns exit code (0/1/2).

    Exit codes:
        0 — all healthy (or no .last files found yet)
        1 — stale beats detected
        2 — heartbeat dir unreadable (permissions, etc.)

    Refs: #1096
    """
    hb_path = Path(heartbeat_dir)

    # Check if dir exists but is unreadable (permissions). Listing needs read
    # permission and opening the .last files inside needs search (execute)
    # permission; without both, every job would be misreported as corrupt.
    if hb_path.exists() and not os.access(heartbeat_dir, os.R_OK | os.X_OK):
        logger.error("Heartbeat dir unreadable: %s", heartbeat_dir)
        return 2

    result = check_cron_heartbeats(heartbeat_dir)
    stale_count = result["stale_count"]

    if output_json:
        print(json.dumps(_json_safe(result), indent=2))
        return 1 if stale_count > 0 else 0

    # Human-readable output
    if not result["jobs"]:
        logger.warning(
            "No .last files found in %s — bezalel not yet provisioned or no jobs registered.",
            heartbeat_dir,
        )
        return 0

    for job in result["jobs"]:
        if job["healthy"]:
            logger.info("  + %s: %s", job["job"], job["message"])
            if not dry_run:
                # The job recovered: drop the alert raised by an earlier cycle.
                _clear_alert(heartbeat_dir, job["job"])
        else:
            logger.error("  - %s: %s", job["job"], job["message"])

    if stale_count > 0:
        for job in result["jobs"]:
            if job["healthy"]:
                continue
            # P1 alert to stderr
            print(
                f"[P1-ALERT] STALE CRON JOB: {job['job']} — {job['message']}",
                file=sys.stderr,
            )
            if dry_run:
                logger.info("DRY RUN — would write alert for stale job: %s", job["job"])
            else:
                write_alert(heartbeat_dir, job)

        logger.error(
            "Heartbeat check FAILED: %d stale, %d healthy",
            stale_count,
            result["healthy_count"],
        )
        return 1

    logger.info(
        "Heartbeat check PASSED: %d healthy, %d stale",
        result["healthy_count"],
        stale_count,
    )
    return 0


# ── CLI entrypoint ───────────────────────────────────────────────────

def main() -> None:
    """Parse command-line arguments, run one check cycle and exit with its code."""
    parser = argparse.ArgumentParser(
        description=(
            "Bezalel Meta-Heartbeat Checker — detect silent cron failures (poka-yoke #1096)"
        ),
    )
    parser.add_argument(
        "--heartbeat-dir",
        default=DEFAULT_HEARTBEAT_DIR,
        help=f"Directory containing .last heartbeat files (default: {DEFAULT_HEARTBEAT_DIR})",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Check and report but do not write alert files",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="output_json",
        help="Output results as JSON (for integration with other tools)",
    )
    args = parser.parse_args()

    exit_code = run_check(
        heartbeat_dir=args.heartbeat_dir,
        dry_run=args.dry_run,
        output_json=args.output_json,
    )
    sys.exit(exit_code)


if __name__ == "__main__":
    main()