[claude] Poka-yoke cron heartbeats: write, check, and report (#1096) (#1107)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled

This commit was merged in pull request #1107.
This commit is contained in:
2026-04-07 14:44:05 +00:00
parent ea3cc6b393
commit 34ec13bc29
6 changed files with 843 additions and 1 deletions

115
scripts/cron-heartbeat-write.sh Executable file
View File

@@ -0,0 +1,115 @@
#!/usr/bin/env bash
# cron-heartbeat-write.sh — Bezalel Cron Heartbeat Writer (poka-yoke #1096)
# Refs: https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
#
# POKA-YOKE design:
# Prevention — Cron jobs declare their identity + expected interval up front.
# Detection — bezalel_heartbeat_check.py reads these files every 15 min and
# alerts P1 if any job is silent for > 2× its interval.
# Correction — Alerts fire fast enough for manual intervention or auto-restart
# before the next scheduled run window expires.
#
# Usage:
# cron-heartbeat-write.sh <job-name> [interval-seconds]
#
# <job-name> Unique identifier for this cron job (e.g. "morning-report")
# [interval-seconds] Expected run interval in seconds (default: 3600)
#
# The heartbeat file is written to:
# /var/run/bezalel/heartbeats/<job-name>.last
#
# File format (JSON):
# {"job":"<name>","timestamp":<epoch_float>,"interval":<secs>,"pid":<pid>}
#
# This script ALWAYS exits 0 — it must never crash the calling cron job.
#
# Typical crontab usage:
# 0 * * * * /root/wizards/the-nexus/scripts/cron-heartbeat-write.sh hourly-job 3600
# 0 6 * * * /root/wizards/the-nexus/scripts/cron-heartbeat-write.sh morning-report 86400
# Shell options: treat unset variables as errors (-u) and make a pipeline
# fail when any stage fails (pipefail). errexit (-e) is not enabled.
set -u
set -o pipefail
# ── Configuration ─────────────────────────────────────────────────────────────
# Directory that receives the heartbeat files. BEZALEL_HEARTBEAT_DIR overrides
# the default when it is set and non-empty.
HEARTBEAT_DIR=${BEZALEL_HEARTBEAT_DIR:-/var/run/bezalel/heartbeats}
# ── Helpers ───────────────────────────────────────────────────────────────────
# log MESSAGE...
#   Print one informational line to stdout in the form
#   "[YYYY-MM-DD HH:MM:SS] HEARTBEAT: MESSAGE", with all arguments joined
#   into a single message.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s\n' "[${stamp}] HEARTBEAT: $*"
}
# warn MESSAGE...
#   Print one warning line to stderr in the form
#   "[YYYY-MM-DD HH:MM:SS] HEARTBEAT WARNING: MESSAGE", with all arguments
#   joined into a single message. Nothing is written to stdout.
warn() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s\n' "[${stamp}] HEARTBEAT WARNING: $*" >&2
}
# ── Input validation ──────────────────────────────────────────────────────────
# The job name is mandatory. Without it, print usage to stderr and stop,
# still with status 0 so the calling cron job is not affected.
if (( $# == 0 )); then
  warn "Usage: $0 <job-name> [interval-seconds]"
  warn "No job name provided — heartbeat not written."
  exit 0
fi
# First positional argument: the job identifier as supplied by the caller.
JOB_NAME=$1
# Optional second argument: expected run interval in seconds (3600 when the
# argument is missing or empty).
INTERVAL_SECS=${2:-3600}
# Reduce the job name to characters that are safe inside a file name:
# everything except letters, digits, underscore, dot and dash is deleted.
SAFE_JOB_NAME=${JOB_NAME//[^a-zA-Z0-9_.-]/}
case "$SAFE_JOB_NAME" in
  "")
    # Nothing usable is left, so there is no file name to write to.
    warn "Job name '${JOB_NAME}' contains only unsafe characters — heartbeat not written."
    exit 0
    ;;
  "$JOB_NAME")
    # Name was already clean; nothing to report.
    ;;
  *)
    # Some characters were dropped; tell the operator which name is in use.
    warn "Job name sanitized: '${JOB_NAME}' → '${SAFE_JOB_NAME}'"
    ;;
esac
# Validate interval is a positive integer
#
# normalize_interval RAW
#   Succeeds when RAW consists only of digits and is greater than zero, and
#   prints it with leading zeros removed. Returns 1 (printing nothing) for
#   empty, non-numeric, signed, fractional or all-zero input.
#
#   The check is purely textual on purpose: Bash arithmetic reads a leading
#   zero as an octal prefix, so evaluating a value such as "08" in (( ... ))
#   raises "value too great for base" instead of yielding 8, and the
#   un-normalized text would end up as an invalid JSON number in the
#   hand-built fallback payload.
normalize_interval() {
  local raw=${1-}
  [[ "$raw" =~ ^[0-9]+$ ]] || return 1
  # ${raw%%[!0]*} is the run of leading zeros; removing it leaves the
  # significant digits, or nothing at all when the value is zero.
  local zeros=${raw%%[!0]*}
  local digits=${raw#"$zeros"}
  [[ -n "$digits" ]] || return 1
  printf '%s\n' "$digits"
}

if normalized_interval=$(normalize_interval "${INTERVAL_SECS-}"); then
  INTERVAL_SECS=$normalized_interval
else
  warn "Invalid interval '${INTERVAL_SECS-}' — using default 3600."
  INTERVAL_SECS=3600
fi
# ── Create heartbeat directory ────────────────────────────────────────────────
# Ensure the heartbeat directory exists (including parents). If it cannot be
# created, warn and stop with status 0 rather than failing the cron job.
mkdir -p "$HEARTBEAT_DIR" 2>/dev/null || {
  warn "Cannot create heartbeat dir '${HEARTBEAT_DIR}' — heartbeat not written."
  exit 0
}
# ── Build JSON payload ────────────────────────────────────────────────────────
# Epoch timestamp: python3 supplies a float; when that invocation fails,
# `date +%s` supplies whole seconds instead.
TIMESTAMP=$(python3 -c "import time; print(time.time())" 2>/dev/null || date +%s)
CURRENT_PID=$$
# Preferred path: let python3 serialize the record so the JSON encoding is
# handled by its json module. Its stderr is discarded.
PAYLOAD=$(
  python3 -c "
import json, sys
print(json.dumps({
'job': sys.argv[1],
'timestamp': float(sys.argv[2]),
'interval': int(sys.argv[3]),
'pid': int(sys.argv[4]),
}))
" "$SAFE_JOB_NAME" "$TIMESTAMP" "$INTERVAL_SECS" "$CURRENT_PID" 2>/dev/null
)
# No output from python3 means the record still has to be built: assemble the
# same four fields by hand.
[[ -n "$PAYLOAD" ]] || printf -v PAYLOAD \
  '{"job":"%s","timestamp":%s,"interval":%s,"pid":%s}' \
  "$SAFE_JOB_NAME" "$TIMESTAMP" "$INTERVAL_SECS" "$CURRENT_PID"
# ── Atomic write via temp + rename ────────────────────────────────────────────
# Writes to a temp file first then renames, so bezalel_heartbeat_check.py
# never sees a partial file mid-write. This is the poka-yoke atomic guarantee.
# Final heartbeat path, plus a hidden staging file in the same directory whose
# name carries this shell's PID.
TARGET_FILE="${HEARTBEAT_DIR}/${SAFE_JOB_NAME}.last"
TMP_FILE="${HEARTBEAT_DIR}/.${SAFE_JOB_NAME}.last.tmp.$$"

# Step 1: stage the payload. On failure remove whatever was created and stop.
if ! printf '%s\n' "$PAYLOAD" > "$TMP_FILE" 2>/dev/null; then
  warn "Write to temp file '${TMP_FILE}' failed — heartbeat not written."
  rm -f "$TMP_FILE" 2>/dev/null || true
  exit 0
fi

# Step 2: publish by renaming the staged file over the target. On failure
# remove the staged file so no stray temp file is left behind.
if ! mv "$TMP_FILE" "$TARGET_FILE" 2>/dev/null; then
  warn "mv failed for '${TMP_FILE}' → '${TARGET_FILE}' — heartbeat not committed."
  rm -f "$TMP_FILE" 2>/dev/null || true
  exit 0
fi

log "Heartbeat written: ${TARGET_FILE} (job=${SAFE_JOB_NAME}, interval=${INTERVAL_SECS}s)"
# Every path ends with status 0 — never crash the calling cron job.
exit 0

View File

@@ -0,0 +1,11 @@
# Service unit that runs the heartbeat checker script once per activation.
# It has no [Install] section of its own.
[Unit]
Description=Bezalel Meta-Heartbeat — stale cron detection (poka-yoke #1096)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
# Ordering only: start after network.target when both are being started.
After=network.target
[Service]
# oneshot: the unit is considered started once the ExecStart process exits.
Type=oneshot
# Executed directly, so the script must be executable and carry a shebang.
ExecStart=/root/wizards/the-nexus/bin/bezalel_heartbeat_check.py
# Both output streams are sent to the journal.
StandardOutput=journal
StandardError=journal
# HOME is set explicitly for the checker process.
Environment=HOME=/root

View File

@@ -0,0 +1,11 @@
# Timer unit that schedules the meta-heartbeat check.
# No Unit= is given here.
# NOTE(review): presumably this activates the service that shares the timer's
# base name — confirm against the installed file names.
[Unit]
Description=Bezalel Meta-Heartbeat — fires every 15 minutes (poka-yoke #1096)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
[Timer]
# First run 5 minutes after boot.
OnBootSec=5min
# Subsequent runs 15 minutes after the triggered unit was last activated.
OnUnitActiveSec=15min
# NOTE(review): systemd.timer(5) documents Persistent= as only having an
# effect on OnCalendar= timers; with only monotonic triggers configured here
# it presumably does nothing — confirm whether OnCalendar= was intended.
Persistent=true
[Install]
# Pulled in by timers.target when the timer is enabled.
WantedBy=timers.target