[claude] Poka-yoke cron heartbeats: write, check, and report (#1096) (#1107)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
This commit was merged in pull request #1107.
This commit is contained in:
115
scripts/cron-heartbeat-write.sh
Executable file
115
scripts/cron-heartbeat-write.sh
Executable file
@@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env bash
|
||||
# cron-heartbeat-write.sh — Bezalel Cron Heartbeat Writer (poka-yoke #1096)
|
||||
# Refs: https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
|
||||
#
|
||||
# POKA-YOKE design:
|
||||
# Prevention — Cron jobs declare their identity + expected interval up front.
|
||||
# Detection — bezalel_heartbeat_check.py reads these files every 15 min and
|
||||
# alerts P1 if any job is silent for > 2× its interval.
|
||||
# Correction — Alerts fire fast enough for manual intervention or auto-restart
|
||||
# before the next scheduled run window expires.
|
||||
#
|
||||
# Usage:
|
||||
# cron-heartbeat-write.sh <job-name> [interval-seconds]
|
||||
#
|
||||
# <job-name> Unique identifier for this cron job (e.g. "morning-report")
|
||||
# [interval-seconds] Expected run interval in seconds (default: 3600)
|
||||
#
|
||||
# The heartbeat file is written to:
|
||||
# /var/run/bezalel/heartbeats/<job-name>.last
|
||||
#
|
||||
# File format (JSON):
|
||||
# {"job":"<name>","timestamp":<epoch_float>,"interval":<secs>,"pid":<pid>}
|
||||
#
|
||||
# This script ALWAYS exits 0 — it must never crash the calling cron job.
|
||||
#
|
||||
# Typical crontab usage:
|
||||
# 0 * * * * /root/wizards/the-nexus/scripts/cron-heartbeat-write.sh hourly-job 3600
|
||||
# 0 6 * * * /root/wizards/the-nexus/scripts/cron-heartbeat-write.sh morning-report 86400
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
# ── Configuration ─────────────────────────────────────────────────────────────
|
||||
HEARTBEAT_DIR="${BEZALEL_HEARTBEAT_DIR:-/var/run/bezalel/heartbeats}"
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] HEARTBEAT: $*"; }
|
||||
warn() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] HEARTBEAT WARNING: $*" >&2; }
|
||||
|
||||
# ── Input validation ──────────────────────────────────────────────────────────
|
||||
if [[ $# -lt 1 ]]; then
|
||||
warn "Usage: $0 <job-name> [interval-seconds]"
|
||||
warn "No job name provided — heartbeat not written."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
JOB_NAME="$1"
|
||||
INTERVAL_SECS="${2:-3600}"
|
||||
|
||||
# Sanitize job name to prevent path traversal / weird filenames
|
||||
# Allow alphanumeric, dash, underscore, dot only
|
||||
SAFE_JOB_NAME="${JOB_NAME//[^a-zA-Z0-9_.-]/}"
|
||||
if [[ -z "$SAFE_JOB_NAME" ]]; then
|
||||
warn "Job name '${JOB_NAME}' contains only unsafe characters — heartbeat not written."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [[ "$SAFE_JOB_NAME" != "$JOB_NAME" ]]; then
|
||||
warn "Job name sanitized: '${JOB_NAME}' → '${SAFE_JOB_NAME}'"
|
||||
fi
|
||||
|
||||
# Validate interval is a positive integer
|
||||
if ! [[ "$INTERVAL_SECS" =~ ^[0-9]+$ ]] || (( INTERVAL_SECS < 1 )); then
|
||||
warn "Invalid interval '${INTERVAL_SECS}' — using default 3600."
|
||||
INTERVAL_SECS=3600
|
||||
fi
|
||||
|
||||
# ── Create heartbeat directory ────────────────────────────────────────────────
|
||||
if ! mkdir -p "$HEARTBEAT_DIR" 2>/dev/null; then
|
||||
warn "Cannot create heartbeat dir '${HEARTBEAT_DIR}' — heartbeat not written."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# ── Build JSON payload ────────────────────────────────────────────────────────
|
||||
# Use python3 for reliable epoch float and JSON encoding.
|
||||
# Falls back to date-based approach if python3 unavailable.
|
||||
TIMESTAMP=$(python3 -c "import time; print(time.time())" 2>/dev/null \
|
||||
|| date +%s)
|
||||
|
||||
CURRENT_PID=$$
|
||||
|
||||
PAYLOAD=$(python3 -c "
|
||||
import json, sys
|
||||
print(json.dumps({
|
||||
'job': sys.argv[1],
|
||||
'timestamp': float(sys.argv[2]),
|
||||
'interval': int(sys.argv[3]),
|
||||
'pid': int(sys.argv[4]),
|
||||
}))
|
||||
" "$SAFE_JOB_NAME" "$TIMESTAMP" "$INTERVAL_SECS" "$CURRENT_PID" 2>/dev/null)
|
||||
|
||||
if [[ -z "$PAYLOAD" ]]; then
|
||||
# Minimal fallback if python3 fails
|
||||
PAYLOAD="{\"job\":\"${SAFE_JOB_NAME}\",\"timestamp\":${TIMESTAMP},\"interval\":${INTERVAL_SECS},\"pid\":${CURRENT_PID}}"
|
||||
fi
|
||||
|
||||
# ── Atomic write via temp + rename ────────────────────────────────────────────
|
||||
# Writes to a temp file first then renames, so bezalel_heartbeat_check.py
|
||||
# never sees a partial file mid-write. This is the poka-yoke atomic guarantee.
|
||||
TARGET_FILE="${HEARTBEAT_DIR}/${SAFE_JOB_NAME}.last"
|
||||
TMP_FILE="${HEARTBEAT_DIR}/.${SAFE_JOB_NAME}.last.tmp.$$"
|
||||
|
||||
if printf '%s\n' "$PAYLOAD" > "$TMP_FILE" 2>/dev/null; then
|
||||
if mv "$TMP_FILE" "$TARGET_FILE" 2>/dev/null; then
|
||||
log "Heartbeat written: ${TARGET_FILE} (job=${SAFE_JOB_NAME}, interval=${INTERVAL_SECS}s)"
|
||||
else
|
||||
warn "mv failed for '${TMP_FILE}' → '${TARGET_FILE}' — heartbeat not committed."
|
||||
rm -f "$TMP_FILE" 2>/dev/null || true
|
||||
fi
|
||||
else
|
||||
warn "Write to temp file '${TMP_FILE}' failed — heartbeat not written."
|
||||
rm -f "$TMP_FILE" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Always exit 0 — never crash the calling cron job.
|
||||
exit 0
|
||||
11
scripts/systemd/bezalel-meta-heartbeat.service
Normal file
11
scripts/systemd/bezalel-meta-heartbeat.service
Normal file
@@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Bezalel Meta-Heartbeat — stale cron detection (poka-yoke #1096)
|
||||
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/root/wizards/the-nexus/bin/bezalel_heartbeat_check.py
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
Environment=HOME=/root
|
||||
11
scripts/systemd/bezalel-meta-heartbeat.timer
Normal file
11
scripts/systemd/bezalel-meta-heartbeat.timer
Normal file
@@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Bezalel Meta-Heartbeat — fires every 15 minutes (poka-yoke #1096)
|
||||
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
|
||||
|
||||
[Timer]
|
||||
OnBootSec=5min
|
||||
OnUnitActiveSec=15min
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
Reference in New Issue
Block a user