Files
the-nexus/bin/bezalel_heartbeat_check.py
2026-04-07 14:44:05 +00:00

327 lines
10 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Bezalel Meta-Heartbeat Checker — stale cron detection (poka-yoke #1096)
Monitors all cron job heartbeat files and alerts P1 when any job has been
silent for more than 2× its declared interval.
POKA-YOKE design:
Prevention — cron-heartbeat-write.sh writes a .last file atomically after
every successful cron job completion, stamping its interval.
Detection — this script runs every 15 minutes (via systemd timer) and
raises P1 on stderr + writes an alert file for any stale job.
Correction — alerts are loud enough (P1 stderr + alert files) for
monitoring/humans to intervene before the next run window.
ZERO DEPENDENCIES
=================
Pure stdlib. No pip installs.
USAGE
=====
# One-shot check (default dir)
python bin/bezalel_heartbeat_check.py
# Override heartbeat dir
python bin/bezalel_heartbeat_check.py --heartbeat-dir /tmp/test-beats
# Dry-run (check + report, don't write alert files)
python bin/bezalel_heartbeat_check.py --dry-run
# JSON output (for piping into other tools)
python bin/bezalel_heartbeat_check.py --json
EXIT CODES
==========
0 — all jobs healthy (or no .last files found yet)
1 — one or more stale beats detected
2 — heartbeat dir unreadable
IMPORTABLE API
==============
from bin.bezalel_heartbeat_check import check_cron_heartbeats
result = check_cron_heartbeats("/var/run/bezalel/heartbeats")
# Returns dict with keys: checked_at, jobs, stale_count, healthy_count
Refs: https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
"""
from __future__ import annotations

import argparse
import json
import logging
import math
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
# Root logging is configured when this module is imported: INFO level,
# timestamped lines with a fixed-width level name.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)-7s %(message)s",
    level=logging.INFO,
)

# Named logger used by the alert writer and the CLI runner below.
logger = logging.getLogger("bezalel.heartbeat")
# ── Configuration ────────────────────────────────────────────────────
DEFAULT_HEARTBEAT_DIR = "/var/run/bezalel/heartbeats"
# ── Core checker ─────────────────────────────────────────────────────


def _corrupt_job(job_name: str, reason: str) -> Dict[str, Any]:
    """Build the report entry for a heartbeat file that cannot be trusted."""
    return {
        "job": job_name,
        "healthy": False,
        "age_secs": float("inf"),
        "interval": 3600,
        "last_seen": None,
        "message": f"CORRUPT: {reason}",
    }


def check_cron_heartbeats(heartbeat_dir: str = DEFAULT_HEARTBEAT_DIR) -> Dict[str, Any]:
    """
    Scan all .last files in heartbeat_dir and determine which jobs are stale.

    A job is stale when its heartbeat is older than 2x the interval declared
    in its .last file. Files are processed in sorted path order.

    Returns a dict:
        {
            "checked_at": "<ISO 8601 timestamp>",
            "jobs": [
                {
                    "job": str,
                    "healthy": bool,
                    "age_secs": float,      # float("inf") for corrupt files
                    "interval": int,
                    "last_seen": str or None,  # ISO timestamp of last heartbeat
                    "message": str,
                },
                ...
            ],
            "stale_count": int,
            "healthy_count": int,
        }

    On a missing or empty dir (no .last files), returns jobs=[] with
    stale_count=0.

    On a corrupt .last file, reports that job as stale with a "CORRUPT: ..."
    message instead of raising. "Corrupt" covers: unreadable file, bytes that
    are not valid UTF-8, invalid JSON, a JSON payload that is not an object,
    non-numeric "timestamp"/"interval" values, and a NaN/Infinity timestamp.

    Refs: #1096
    """
    now_ts = time.time()
    checked_at = datetime.fromtimestamp(now_ts, tz=timezone.utc).isoformat()
    hb_path = Path(heartbeat_dir)
    jobs: List[Dict[str, Any]] = []

    if not hb_path.exists():
        return {
            "checked_at": checked_at,
            "jobs": [],
            "stale_count": 0,
            "healthy_count": 0,
        }

    for last_file in sorted(hb_path.glob("*.last")):
        job_name = last_file.stem  # filename without .last extension

        # Read and parse the heartbeat file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (non-UTF-8 bytes).
        try:
            data = json.loads(last_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            jobs.append(
                _corrupt_job(job_name, f"cannot read/parse heartbeat file: {exc}")
            )
            continue

        # Valid JSON that is not an object (e.g. a list or a bare number)
        # has no fields to read.
        if not isinstance(data, dict):
            jobs.append(
                _corrupt_job(
                    job_name,
                    f"heartbeat payload is not a JSON object (got {type(data).__name__})",
                )
            )
            continue

        # Extract fields with safe defaults; wrong-typed values are corrupt.
        try:
            beat_timestamp = float(data.get("timestamp", 0))
            interval = int(data.get("interval", 3600))
        except (TypeError, ValueError, OverflowError) as exc:
            jobs.append(
                _corrupt_job(job_name, f"invalid timestamp/interval field: {exc}")
            )
            continue

        # json.loads accepts NaN/Infinity. Comparisons against NaN are always
        # False, so without this guard such a beat would be reported healthy.
        if not math.isfinite(beat_timestamp):
            jobs.append(
                _corrupt_job(job_name, f"non-finite timestamp: {beat_timestamp!r}")
            )
            continue

        age_secs = now_ts - beat_timestamp

        # Convert beat_timestamp to a readable ISO string; out-of-range values
        # simply leave last_seen unset.
        try:
            last_seen = datetime.fromtimestamp(beat_timestamp, tz=timezone.utc).isoformat()
        except (OSError, OverflowError, ValueError):
            last_seen = None

        # Stale = silent for more than 2× the declared interval
        threshold = 2 * interval
        is_stale = age_secs > threshold
        if is_stale:
            message = (
                f"STALE (last {age_secs:.0f}s ago, interval {interval}s"
                f" — exceeds 2x threshold of {threshold}s)"
            )
        else:
            message = f"OK (last {age_secs:.0f}s ago, interval {interval}s)"

        jobs.append({
            "job": job_name,
            "healthy": not is_stale,
            "age_secs": age_secs,
            "interval": interval,
            "last_seen": last_seen,
            "message": message,
        })

    stale_count = sum(1 for j in jobs if not j["healthy"])
    healthy_count = sum(1 for j in jobs if j["healthy"])
    return {
        "checked_at": checked_at,
        "jobs": jobs,
        "stale_count": stale_count,
        "healthy_count": healthy_count,
    }
# ── Alert file writer ────────────────────────────────────────────────
def write_alert(heartbeat_dir: str, job_info: Dict[str, Any]) -> None:
    """
    Write an alert file for a stale job to <heartbeat_dir>/alerts/<job>.alert

    The file is a JSON object with the keys alert_level ("P1"), job, message,
    age_secs, interval, last_seen and detected_at (UTC ISO timestamp). A
    non-finite age_secs (corrupt heartbeats are reported with float("inf"))
    is written as null, because json.dumps would otherwise emit the
    non-standard token "Infinity", which strict JSON parsers reject.

    Failures are best-effort: if the alerts directory or the file cannot be
    written, a warning is logged and the function returns without raising.

    Args:
        heartbeat_dir: Directory that holds the .last files; alerts go in its
            "alerts" subdirectory, which is created if missing.
        job_info: A job entry with the keys "job", "message", "age_secs",
            "interval" and "last_seen".

    Refs: #1096
    """
    alerts_dir = Path(heartbeat_dir) / "alerts"
    try:
        alerts_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        logger.warning("Cannot create alerts dir %s: %s", alerts_dir, exc)
        return

    alert_file = alerts_dir / f"{job_info['job']}.alert"

    # Keep the alert file strict JSON: inf/nan have no JSON representation.
    age_secs = job_info["age_secs"]
    if isinstance(age_secs, float) and not math.isfinite(age_secs):
        age_secs = None

    content = {
        "alert_level": "P1",
        "job": job_info["job"],
        "message": job_info["message"],
        "age_secs": age_secs,
        "interval": job_info["interval"],
        "last_seen": job_info["last_seen"],
        "detected_at": datetime.now(tz=timezone.utc).isoformat(),
    }

    # Atomic write via temp + replace (same poka-yoke pattern as the writer).
    # The PID suffix keeps concurrent checkers from sharing a temp file.
    tmp_file = alert_file.with_name(f"{alert_file.name}.tmp.{os.getpid()}")
    try:
        tmp_file.write_text(json.dumps(content, indent=2), encoding="utf-8")
        # replace() overwrites an existing alert on every platform, whereas
        # rename() fails on Windows when the target already exists.
        tmp_file.replace(alert_file)
    except OSError as exc:
        logger.warning("Failed to write alert file %s: %s", alert_file, exc)
        # Cleanup is best-effort too; never let it escape this handler.
        try:
            tmp_file.unlink(missing_ok=True)
        except OSError:
            pass
# ── Main runner ──────────────────────────────────────────────────────
def _strict_json_result(result: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of result whose non-finite age_secs values are None."""
    safe_jobs = []
    for job in result["jobs"]:
        age_secs = job.get("age_secs")
        if isinstance(age_secs, float) and not math.isfinite(age_secs):
            job = {**job, "age_secs": None}
        safe_jobs.append(job)
    return {**result, "jobs": safe_jobs}


def _clear_alert(heartbeat_dir: str, job_name: str) -> None:
    """Remove <heartbeat_dir>/alerts/<job>.alert if present (best effort)."""
    alert_file = Path(heartbeat_dir) / "alerts" / f"{job_name}.alert"
    try:
        alert_file.unlink(missing_ok=True)
    except OSError as exc:
        logger.warning("Failed to clear alert file %s: %s", alert_file, exc)


def run_check(heartbeat_dir: str, dry_run: bool = False, output_json: bool = False) -> int:
    """
    Run a full heartbeat check cycle. Returns exit code (0/1/2).

    Args:
        heartbeat_dir: Directory containing the .last heartbeat files.
        dry_run: Report only; do not write or clear alert files.
        output_json: Print the result as JSON on stdout and return. In this
            mode no alert files are written or cleared and no P1 lines are
            emitted. Non-finite age_secs values (corrupt heartbeats) are
            printed as null so the output is strict JSON.

    In the default (human-readable) mode each job is logged, every stale job
    gets a "[P1-ALERT]" line on stderr plus an alert file, and the alert file
    of every healthy job is removed so that alerts do not outlive recovery.

    Exit codes:
        0 — all healthy (or no .last files found yet)
        1 — stale beats detected
        2 — heartbeat dir unreadable (permissions, etc.)

    Refs: #1096
    """
    hb_path = Path(heartbeat_dir)

    # Check if dir exists but is unreadable (permissions)
    if hb_path.exists() and not os.access(heartbeat_dir, os.R_OK):
        logger.error("Heartbeat dir unreadable: %s", heartbeat_dir)
        return 2

    result = check_cron_heartbeats(heartbeat_dir)

    if output_json:
        print(json.dumps(_strict_json_result(result), indent=2))
        return 1 if result["stale_count"] > 0 else 0

    # Human-readable output
    if not result["jobs"]:
        logger.warning(
            "No .last files found in %s — bezalel not yet provisioned or no jobs registered.",
            heartbeat_dir,
        )
        return 0

    for job in result["jobs"]:
        if job["healthy"]:
            logger.info(" + %s: %s", job["job"], job["message"])
            # A recovered job must not leave a stale P1 alert behind.
            if not dry_run:
                _clear_alert(heartbeat_dir, job["job"])
        else:
            logger.error(" - %s: %s", job["job"], job["message"])

    if result["stale_count"] > 0:
        for job in result["jobs"]:
            if job["healthy"]:
                continue
            # P1 alert to stderr; the separator keeps job name and message
            # from running together.
            print(
                f"[P1-ALERT] STALE CRON JOB: {job['job']} — {job['message']}",
                file=sys.stderr,
            )
            if not dry_run:
                write_alert(heartbeat_dir, job)
            else:
                logger.info("DRY RUN — would write alert for stale job: %s", job["job"])
        logger.error(
            "Heartbeat check FAILED: %d stale, %d healthy",
            result["stale_count"],
            result["healthy_count"],
        )
        return 1

    logger.info(
        "Heartbeat check PASSED: %d healthy, %d stale",
        result["healthy_count"],
        result["stale_count"],
    )
    return 0
# ── CLI entrypoint ───────────────────────────────────────────────────
def main() -> None:
    """Parse the command line, run one check cycle, and exit with its code.

    Flags: --heartbeat-dir (defaults to DEFAULT_HEARTBEAT_DIR), --dry-run and
    --json. The process exit status is whatever run_check returns.
    """
    cli = argparse.ArgumentParser(
        description="Bezalel Meta-Heartbeat Checker — detect silent cron failures (poka-yoke #1096)",
    )
    cli.add_argument(
        "--heartbeat-dir",
        default=DEFAULT_HEARTBEAT_DIR,
        help=f"Directory containing .last heartbeat files (default: {DEFAULT_HEARTBEAT_DIR})",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Check and report but do not write alert files",
    )
    cli.add_argument(
        "--json",
        action="store_true",
        dest="output_json",
        help="Output results as JSON (for integration with other tools)",
    )
    opts = cli.parse_args()

    sys.exit(
        run_check(
            heartbeat_dir=opts.heartbeat_dir,
            dry_run=opts.dry_run,
            output_json=opts.output_json,
        )
    )


if __name__ == "__main__":
    main()