327 lines
10 KiB
Python
327 lines
10 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Bezalel Meta-Heartbeat Checker — stale cron detection (poka-yoke #1096)
|
|||
|
|
|
|||
|
|
Monitors all cron job heartbeat files and alerts P1 when any job has been
|
|||
|
|
silent for more than 2× its declared interval.
|
|||
|
|
|
|||
|
|
POKA-YOKE design:
|
|||
|
|
Prevention — cron-heartbeat-write.sh writes a .last file atomically after
|
|||
|
|
every successful cron job completion, stamping its interval.
|
|||
|
|
Detection — this script runs every 15 minutes (via systemd timer) and
|
|||
|
|
raises P1 on stderr + writes an alert file for any stale job.
|
|||
|
|
Correction — alerts are loud enough (P1 stderr + alert files) for
|
|||
|
|
monitoring/humans to intervene before the next run window.
|
|||
|
|
|
|||
|
|
ZERO DEPENDENCIES
|
|||
|
|
=================
|
|||
|
|
Pure stdlib. No pip installs.
|
|||
|
|
|
|||
|
|
USAGE
|
|||
|
|
=====
|
|||
|
|
# One-shot check (default dir)
|
|||
|
|
python bin/bezalel_heartbeat_check.py
|
|||
|
|
|
|||
|
|
# Override heartbeat dir
|
|||
|
|
python bin/bezalel_heartbeat_check.py --heartbeat-dir /tmp/test-beats
|
|||
|
|
|
|||
|
|
# Dry-run (check + report, don't write alert files)
|
|||
|
|
python bin/bezalel_heartbeat_check.py --dry-run
|
|||
|
|
|
|||
|
|
# JSON output (for piping into other tools)
|
|||
|
|
python bin/bezalel_heartbeat_check.py --json
|
|||
|
|
|
|||
|
|
EXIT CODES
|
|||
|
|
==========
|
|||
|
|
0 — all jobs healthy (or no .last files found yet)
|
|||
|
|
1 — one or more stale beats detected
|
|||
|
|
2 — heartbeat dir unreadable
|
|||
|
|
|
|||
|
|
IMPORTABLE API
|
|||
|
|
==============
|
|||
|
|
from bin.bezalel_heartbeat_check import check_cron_heartbeats
|
|||
|
|
|
|||
|
|
result = check_cron_heartbeats("/var/run/bezalel/heartbeats")
|
|||
|
|
# Returns dict with keys: checked_at, jobs, stale_count, healthy_count
|
|||
|
|
|
|||
|
|
Refs: https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import logging
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import time
|
|||
|
|
from datetime import datetime, timezone
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Any, Dict, List, Optional
|
|||
|
|
|
|||
|
|
# Script-first design: configure the root logger at import time so the
# systemd-timer invocation gets timestamped output with no extra setup.
# NOTE(review): this calls basicConfig as a module side effect, which can
# surprise importers that configure logging themselves — acceptable here
# because the file is primarily run as a CLI tool.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Named logger shared by all functions in this module.
logger = logging.getLogger("bezalel.heartbeat")
|
|||
|
|
|
|||
|
|
# ── Configuration ────────────────────────────────────────────────────

# Default location where cron-heartbeat-write.sh drops its .last files.
DEFAULT_HEARTBEAT_DIR = "/var/run/bezalel/heartbeats"


# ── Core checker ─────────────────────────────────────────────────────

def check_cron_heartbeats(heartbeat_dir: str = DEFAULT_HEARTBEAT_DIR) -> Dict[str, Any]:
    """
    Scan all .last files in *heartbeat_dir* and determine which jobs are stale.

    A job is stale when its heartbeat is older than 2x its declared interval.

    Returns a dict:
        {
            "checked_at": "<ISO 8601 timestamp>",
            "jobs": [
                {
                    "job": str,
                    "healthy": bool,
                    "age_secs": float,
                    "interval": int,
                    "last_seen": str or None,  # ISO timestamp of last heartbeat
                    "message": str,
                },
                ...
            ],
            "stale_count": int,
            "healthy_count": int,
        }

    On empty/missing dir (no .last files), returns jobs=[] with stale_count=0.
    On a corrupt .last file — unreadable, invalid JSON, non-object JSON, or
    non-numeric timestamp/interval fields — that job is reported as stale
    with a CORRUPT message instead of raising.

    Refs: #1096
    """
    now_ts = time.time()
    checked_at = datetime.fromtimestamp(now_ts, tz=timezone.utc).isoformat()

    hb_path = Path(heartbeat_dir)
    jobs: List[Dict[str, Any]] = []

    if not hb_path.exists():
        # Nothing provisioned yet — report "all clear" rather than erroring.
        return {
            "checked_at": checked_at,
            "jobs": [],
            "stale_count": 0,
            "healthy_count": 0,
        }

    for last_file in sorted(hb_path.glob("*.last")):
        job_name = last_file.stem  # filename without .last extension

        # Read, parse, and coerce the heartbeat fields.  Field coercion
        # lives INSIDE the try so that syntactically valid JSON carrying
        # garbage values (e.g. "interval": "soon") or a non-object payload
        # is reported as CORRUPT instead of crashing the whole check
        # (bug fix: coercion previously ran outside this handler and an
        # uncaught ValueError/TypeError aborted every remaining job).
        try:
            data = json.loads(last_file.read_text(encoding="utf-8"))
            if not isinstance(data, dict):
                raise ValueError(f"expected JSON object, got {type(data).__name__}")
            beat_timestamp = float(data.get("timestamp", 0))
            interval = int(data.get("interval", 3600))
        except (OSError, json.JSONDecodeError, TypeError, ValueError) as exc:
            jobs.append({
                "job": job_name,
                "healthy": False,
                # inf marks corrupt jobs as "maximally stale".  NOTE(review):
                # json.dumps serializes this as the non-strict token
                # Infinity — confirm downstream --json consumers accept it.
                "age_secs": float("inf"),
                "interval": 3600,  # fallback: assume hourly when unreadable
                "last_seen": None,
                "message": f"CORRUPT: cannot read/parse heartbeat file: {exc}",
            })
            continue

        age_secs = now_ts - beat_timestamp

        # Convert beat_timestamp to a readable ISO string; out-of-range
        # epochs yield last_seen=None rather than raising.
        try:
            last_seen = datetime.fromtimestamp(beat_timestamp, tz=timezone.utc).isoformat()
        except (OSError, OverflowError, ValueError):
            last_seen = None

        # Stale = silent for more than 2× the declared interval
        threshold = 2 * interval
        is_stale = age_secs > threshold

        if is_stale:
            message = (
                f"STALE (last {age_secs:.0f}s ago, interval {interval}s"
                f" — exceeds 2x threshold of {threshold}s)"
            )
        else:
            message = f"OK (last {age_secs:.0f}s ago, interval {interval}s)"

        jobs.append({
            "job": job_name,
            "healthy": not is_stale,
            "age_secs": age_secs,
            "interval": interval,
            "last_seen": last_seen,
            "message": message,
        })

    stale_count = sum(1 for j in jobs if not j["healthy"])
    healthy_count = len(jobs) - stale_count

    return {
        "checked_at": checked_at,
        "jobs": jobs,
        "stale_count": stale_count,
        "healthy_count": healthy_count,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── Alert file writer ────────────────────────────────────────────────

def write_alert(heartbeat_dir: str, job_info: Dict[str, Any]) -> None:
    """
    Write an alert file for a stale job to <heartbeat_dir>/alerts/<job>.alert

    Alert files are watched by external monitoring.  They persist until the
    job runs again and clears stale status on the next check cycle.

    Best-effort: failures to create the dir or write the file are logged as
    warnings, never raised — the P1 stderr alert has already fired by the
    time this runs.

    Refs: #1096
    """
    alerts_dir = Path(heartbeat_dir) / "alerts"
    try:
        alerts_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        logger.warning("Cannot create alerts dir %s: %s", alerts_dir, exc)
        return

    alert_file = alerts_dir / f"{job_info['job']}.alert"
    now_str = datetime.now(tz=timezone.utc).isoformat()

    content = {
        "alert_level": "P1",
        "job": job_info["job"],
        "message": job_info["message"],
        "age_secs": job_info["age_secs"],
        "interval": job_info["interval"],
        "last_seen": job_info["last_seen"],
        "detected_at": now_str,
    }

    # Atomic write via temp + replace (same poka-yoke pattern as the writer).
    # Bug fix: Path.replace() instead of Path.rename() — rename() raises
    # FileExistsError on Windows when the target exists, which would break
    # re-alerting on a still-stale job; replace() is the documented
    # cross-platform atomic overwrite (os.replace semantics).
    tmp_file = alert_file.with_suffix(f".alert.tmp.{os.getpid()}")
    try:
        tmp_file.write_text(json.dumps(content, indent=2), encoding="utf-8")
        tmp_file.replace(alert_file)
    except OSError as exc:
        logger.warning("Failed to write alert file %s: %s", alert_file, exc)
        tmp_file.unlink(missing_ok=True)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── Main runner ──────────────────────────────────────────────────────

def run_check(heartbeat_dir: str, dry_run: bool = False, output_json: bool = False) -> int:
    """
    Run one full heartbeat check cycle and return the process exit code.

    Exit codes:
        0 — all healthy (or no .last files found yet)
        1 — stale beats detected
        2 — heartbeat dir unreadable (permissions, etc.)

    Refs: #1096
    """
    # An existing-but-unreadable dir is an environment problem rather than
    # a stale-job problem, so it gets its own exit code.
    hb_path = Path(heartbeat_dir)
    if hb_path.exists() and not os.access(heartbeat_dir, os.R_OK):
        logger.error("Heartbeat dir unreadable: %s", heartbeat_dir)
        return 2

    result = check_cron_heartbeats(heartbeat_dir)
    stale = result["stale_count"]

    # Machine-readable mode: dump the result and exit — no alert files.
    if output_json:
        print(json.dumps(result, indent=2))
        return 1 if stale > 0 else 0

    # Human-readable output
    if not result["jobs"]:
        logger.warning(
            "No .last files found in %s — bezalel not yet provisioned or no jobs registered.",
            heartbeat_dir,
        )
        return 0

    for job in result["jobs"]:
        emit = logger.info if job["healthy"] else logger.error
        marker = "+" if job["healthy"] else "-"
        emit(" %s %s: %s", marker, job["job"], job["message"])

    if stale == 0:
        logger.info(
            "Heartbeat check PASSED: %d healthy, %d stale",
            result["healthy_count"],
            stale,
        )
        return 0

    for job in result["jobs"]:
        if job["healthy"]:
            continue
        # P1 alert to stderr
        print(
            f"[P1-ALERT] STALE CRON JOB: {job['job']} — {job['message']}",
            file=sys.stderr,
        )
        if dry_run:
            logger.info("DRY RUN — would write alert for stale job: %s", job["job"])
        else:
            write_alert(heartbeat_dir, job)

    logger.error(
        "Heartbeat check FAILED: %d stale, %d healthy",
        stale,
        result["healthy_count"],
    )
    return 1
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── CLI entrypoint ───────────────────────────────────────────────────

def main() -> None:
    """Parse CLI flags, run one heartbeat check cycle, and exit 0/1/2."""
    parser = argparse.ArgumentParser(
        description=(
            "Bezalel Meta-Heartbeat Checker — detect silent cron failures (poka-yoke #1096)"
        ),
    )
    parser.add_argument(
        "--heartbeat-dir",
        default=DEFAULT_HEARTBEAT_DIR,
        help=f"Directory containing .last heartbeat files (default: {DEFAULT_HEARTBEAT_DIR})",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Check and report but do not write alert files",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="output_json",
        help="Output results as JSON (for integration with other tools)",
    )
    opts = parser.parse_args()

    # Exit code maps directly onto the systemd unit's success/failure state.
    sys.exit(
        run_check(
            heartbeat_dir=opts.heartbeat_dir,
            dry_run=opts.dry_run,
            output_json=opts.output_json,
        )
    )


if __name__ == "__main__":
    main()
|