#!/usr/bin/env python3
"""
Bezalel Meta-Heartbeat Checker — stale cron detection (poka-yoke #1096)

Monitors all cron job heartbeat files and alerts P1 when any job has been
silent for more than 2× its declared interval.

POKA-YOKE design:
  Prevention — cron-heartbeat-write.sh writes a .last file atomically after
    every successful cron job completion, stamping its interval.
  Detection — this script runs every 15 minutes (via systemd timer) and
    raises P1 on stderr + writes an alert file for any stale job.
  Correction — alerts are loud enough (P1 stderr + alert files) for
    monitoring/humans to intervene before the next run window.

ZERO DEPENDENCIES
=================
Pure stdlib. No pip installs.

USAGE
=====
    # One-shot check (default dir)
    python bin/bezalel_heartbeat_check.py

    # Override heartbeat dir
    python bin/bezalel_heartbeat_check.py --heartbeat-dir /tmp/test-beats

    # Dry-run (check + report, don't write alert files)
    python bin/bezalel_heartbeat_check.py --dry-run

    # JSON output (for piping into other tools)
    python bin/bezalel_heartbeat_check.py --json

EXIT CODES
==========
0 — all jobs healthy (or no .last files found yet)
1 — one or more stale beats detected
2 — heartbeat dir unreadable

IMPORTABLE API
==============
    from bin.bezalel_heartbeat_check import check_cron_heartbeats

    result = check_cron_heartbeats("/var/run/bezalel/heartbeats")
    # Returns dict with keys: checked_at, jobs, stale_count, healthy_count

Refs: https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# Module-wide logging: timestamped, level-tagged lines on stderr (basicConfig
# default stream) so systemd/journald capture them without extra plumbing.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("bezalel.heartbeat")
# ── Configuration ────────────────────────────────────────────────────

DEFAULT_HEARTBEAT_DIR = "/var/run/bezalel/heartbeats"


# ── Core checker ─────────────────────────────────────────────────────

def check_cron_heartbeats(heartbeat_dir: str = DEFAULT_HEARTBEAT_DIR) -> Dict[str, Any]:
    """
    Scan all .last files in heartbeat_dir and determine which jobs are stale.

    Returns a dict:
        {
          "checked_at": "<ISO 8601 timestamp>",
          "jobs": [
            {
              "job": str,
              "healthy": bool,
              "age_secs": float,
              "interval": int,
              "last_seen": str or None,  # ISO timestamp of last heartbeat
              "message": str,
            },
            ...
          ],
          "stale_count": int,
          "healthy_count": int,
        }

    On empty dir (no .last files), returns jobs=[] with stale_count=0.
    On a corrupt .last file (unreadable, invalid JSON, or non-numeric
    timestamp/interval fields), reports that job as stale with an error
    message instead of raising.

    Refs: #1096
    """
    now_ts = time.time()
    checked_at = datetime.fromtimestamp(now_ts, tz=timezone.utc).isoformat()

    hb_path = Path(heartbeat_dir)
    jobs: List[Dict[str, Any]] = []

    # Missing dir = nothing provisioned yet; report an empty, healthy result.
    if not hb_path.exists():
        return {
            "checked_at": checked_at,
            "jobs": [],
            "stale_count": 0,
            "healthy_count": 0,
        }

    for last_file in sorted(hb_path.glob("*.last")):
        job_name = last_file.stem  # filename without .last extension

        # Read, parse, AND coerce fields inside the try: a syntactically
        # valid JSON file with garbage values (e.g. "timestamp": "oops", or
        # a top-level array) previously raised an uncaught ValueError /
        # TypeError / AttributeError and crashed the whole check. Now every
        # unusable file degrades to a CORRUPT (stale) report for that job.
        try:
            data = json.loads(last_file.read_text(encoding="utf-8"))
            beat_timestamp = float(data.get("timestamp", 0))
            interval = int(data.get("interval", 3600))
        except (OSError, json.JSONDecodeError, AttributeError, TypeError, ValueError) as exc:
            jobs.append({
                "job": job_name,
                "healthy": False,
                "age_secs": float("inf"),
                "interval": 3600,
                "last_seen": None,
                "message": f"CORRUPT: cannot read/parse heartbeat file: {exc}",
            })
            continue

        age_secs = now_ts - beat_timestamp

        # Convert beat_timestamp to a readable ISO string; out-of-range
        # epochs degrade to None rather than failing the check.
        try:
            last_seen = datetime.fromtimestamp(beat_timestamp, tz=timezone.utc).isoformat()
        except (OSError, OverflowError, ValueError):
            last_seen = None

        # Stale = silent for more than 2× the declared interval.
        threshold = 2 * interval
        is_stale = age_secs > threshold

        if is_stale:
            message = (
                f"STALE (last {age_secs:.0f}s ago, interval {interval}s"
                f" — exceeds 2x threshold of {threshold}s)"
            )
        else:
            message = f"OK (last {age_secs:.0f}s ago, interval {interval}s)"

        jobs.append({
            "job": job_name,
            "healthy": not is_stale,
            "age_secs": age_secs,
            "interval": interval,
            "last_seen": last_seen,
            "message": message,
        })

    stale_count = sum(1 for j in jobs if not j["healthy"])
    healthy_count = len(jobs) - stale_count

    return {
        "checked_at": checked_at,
        "jobs": jobs,
        "stale_count": stale_count,
        "healthy_count": healthy_count,
    }
# ── Alert file writer ────────────────────────────────────────────────

def write_alert(heartbeat_dir: str, job_info: Dict[str, Any]) -> None:
    """
    Persist a P1 alert for a stale job at <heartbeat_dir>/alerts/<job>.alert.

    External monitoring watches these files; they remain until the job beats
    again and a later check cycle clears its stale status. All filesystem
    failures here are logged and swallowed — alerting must never crash the
    checker itself.

    Refs: #1096
    """
    alerts_dir = Path(heartbeat_dir) / "alerts"
    try:
        alerts_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        logger.warning("Cannot create alerts dir %s: %s", alerts_dir, exc)
        return

    target = alerts_dir / f"{job_info['job']}.alert"
    payload = {
        "alert_level": "P1",
        "job": job_info["job"],
        "message": job_info["message"],
        "age_secs": job_info["age_secs"],
        "interval": job_info["interval"],
        "last_seen": job_info["last_seen"],
        "detected_at": datetime.now(tz=timezone.utc).isoformat(),
    }

    # Write to a PID-suffixed scratch file, then rename: readers never see a
    # half-written alert (same atomic poka-yoke pattern as the writer side).
    scratch = target.with_suffix(f".alert.tmp.{os.getpid()}")
    try:
        scratch.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        scratch.rename(target)
    except OSError as exc:
        logger.warning("Failed to write alert file %s: %s", target, exc)
        scratch.unlink(missing_ok=True)
# ── Main runner ──────────────────────────────────────────────────────

def run_check(heartbeat_dir: str, dry_run: bool = False, output_json: bool = False) -> int:
    """
    Run a full heartbeat check cycle and report the outcome.

    Exit codes (returned, not raised):
        0 — all healthy (or no .last files found yet)
        1 — stale beats detected
        2 — heartbeat dir unreadable (permissions, etc.)

    In JSON mode the raw result dict is printed and no alert files are
    written; in human mode each stale job gets a P1 line on stderr and
    (unless dry_run) an alert file via write_alert().

    Refs: #1096
    """
    # An existing-but-unreadable dir is a hard failure (permissions problem),
    # distinct from "doesn't exist yet" which the checker treats as empty.
    if Path(heartbeat_dir).exists() and not os.access(heartbeat_dir, os.R_OK):
        logger.error("Heartbeat dir unreadable: %s", heartbeat_dir)
        return 2

    report = check_cron_heartbeats(heartbeat_dir)
    stale_jobs = [entry for entry in report["jobs"] if not entry["healthy"]]

    if output_json:
        # Machine-readable mode: dump everything, let the caller decide.
        print(json.dumps(report, indent=2))
        return 1 if report["stale_count"] > 0 else 0

    # Human-readable output
    if not report["jobs"]:
        logger.warning(
            "No .last files found in %s — bezalel not yet provisioned or no jobs registered.",
            heartbeat_dir,
        )
        return 0

    for entry in report["jobs"]:
        if entry["healthy"]:
            logger.info(" + %s: %s", entry["job"], entry["message"])
        else:
            logger.error(" - %s: %s", entry["job"], entry["message"])

    if not stale_jobs:
        logger.info(
            "Heartbeat check PASSED: %d healthy, %d stale",
            report["healthy_count"],
            report["stale_count"],
        )
        return 0

    for entry in stale_jobs:
        # P1 alert to stderr
        print(
            f"[P1-ALERT] STALE CRON JOB: {entry['job']} — {entry['message']}",
            file=sys.stderr,
        )
        if dry_run:
            logger.info("DRY RUN — would write alert for stale job: %s", entry["job"])
        else:
            write_alert(heartbeat_dir, entry)

    logger.error(
        "Heartbeat check FAILED: %d stale, %d healthy",
        report["stale_count"],
        report["healthy_count"],
    )
    return 1
# ── CLI entrypoint ───────────────────────────────────────────────────

def main() -> None:
    """Parse CLI flags, run one check cycle, and exit with its status code."""
    cli = argparse.ArgumentParser(
        description=(
            "Bezalel Meta-Heartbeat Checker — detect silent cron failures (poka-yoke #1096)"
        ),
    )
    cli.add_argument(
        "--heartbeat-dir",
        default=DEFAULT_HEARTBEAT_DIR,
        help=f"Directory containing .last heartbeat files (default: {DEFAULT_HEARTBEAT_DIR})",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Check and report but do not write alert files",
    )
    cli.add_argument(
        "--json",
        action="store_true",
        dest="output_json",
        help="Output results as JSON (for integration with other tools)",
    )
    opts = cli.parse_args()

    # run_check returns 0/1/2; propagate it as the process exit status.
    sys.exit(
        run_check(
            heartbeat_dir=opts.heartbeat_dir,
            dry_run=opts.dry_run,
            output_json=opts.output_json,
        )
    )


if __name__ == "__main__":
    main()