#!/usr/bin/env bash
# cron-heartbeat-write.sh — Bezalel Cron Heartbeat Writer (poka-yoke #1096)
# Refs: https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1096
#
# POKA-YOKE design:
#   Prevention — Cron jobs declare their identity + expected interval up front.
#   Detection  — bezalel_heartbeat_check.py reads these files every 15 min and
#                alerts P1 if any job is silent for > 2× its interval.
#   Correction — Alerts fire fast enough for manual intervention or auto-restart
#                before the next scheduled run window expires.
#
# Usage:
#   cron-heartbeat-write.sh <job-name> [interval-seconds]
#
#   <job-name>          Unique identifier for this cron job (e.g. "morning-report")
#   [interval-seconds]  Expected run interval in seconds (default: 3600)
#
# The heartbeat file is written to:
#   /var/run/bezalel/heartbeats/<job-name>.last
# (the directory can be overridden with the BEZALEL_HEARTBEAT_DIR environment
# variable).
#
# File format (JSON):
#   {"job":"<name>","timestamp":<epoch_float>,"interval":<secs>,"pid":<pid>}
#
# This script ALWAYS exits 0 — it must never crash the calling cron job.
#
# Typical crontab usage:
#   0 * * * * /root/wizards/the-nexus/scripts/cron-heartbeat-write.sh hourly-job 3600
#   0 6 * * * /root/wizards/the-nexus/scripts/cron-heartbeat-write.sh morning-report 86400

# -u: unset variables are errors; pipefail: a pipeline fails if any stage fails.
# -e is not enabled: every fallible step below is checked explicitly so the
# script can always reach an "exit 0".
set -uo pipefail

# ── Configuration ─────────────────────────────────────────────────────────────
# Directory that receives the heartbeat files. Defaults to
# /var/run/bezalel/heartbeats; set BEZALEL_HEARTBEAT_DIR to a non-empty value
# to write somewhere else.
HEARTBEAT_DIR="${BEZALEL_HEARTBEAT_DIR:-/var/run/bezalel/heartbeats}"

# ── Helpers ───────────────────────────────────────────────────────────────────
# log MESSAGE...
#   Print "[YYYY-MM-DD HH:MM:SS] HEARTBEAT: MESSAGE" to stdout.
#   All arguments are joined with single spaces ("$*").
log() {
  printf '[%s] HEARTBEAT: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# warn MESSAGE...
#   Print "[YYYY-MM-DD HH:MM:SS] HEARTBEAT WARNING: MESSAGE" to stderr.
#   Nothing is written to stdout.
warn() {
  printf '[%s] HEARTBEAT WARNING: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2
}

# ── Input validation ──────────────────────────────────────────────────────────
# A job name is mandatory. When it is missing, explain the usage on stderr and
# stop — but still with status 0, so the calling cron job is never failed by
# a heartbeat problem.
if [[ $# -lt 1 ]]; then
  warn "Usage: $0 <job-name> [interval-seconds]"
  warn "No job name provided — heartbeat not written."
  exit 0
fi

# Raw positional arguments: $1 is the job name, $2 the optional expected run
# interval in seconds (3600 when omitted or empty).
JOB_NAME="$1"
INTERVAL_SECS="${2:-3600}"

# Sanitize job name to prevent path traversal / weird filenames.
# Allow alphanumeric, dash, underscore, dot only: every other character
# (including "/") is deleted from the name.
SAFE_JOB_NAME="${JOB_NAME//[^a-zA-Z0-9_.-]/}"
if [[ -z "$SAFE_JOB_NAME" ]]; then
  # Nothing usable is left to build a file name from.
  warn "Job name '${JOB_NAME}' contains only unsafe characters — heartbeat not written."
  exit 0
fi

# A name that merely lost some characters is still used; the operator is told
# which name the heartbeat is filed under.
if [[ "$SAFE_JOB_NAME" != "$JOB_NAME" ]]; then
  warn "Job name sanitized: '${JOB_NAME}' → '${SAFE_JOB_NAME}'"
fi

# Validate interval is a positive integer.
#
# normalize_interval RAW
#   Prints RAW as a canonical base-10 integer and returns 0 when RAW consists
#   only of digits and its value is >= 1. Prints nothing and returns 1
#   otherwise.
#
#   The explicit "10#" base is required: bash arithmetic treats a leading zero
#   as octal, so a plain (( RAW < 1 )) fails with "value too great for base"
#   on inputs such as "08" or "09" and would leave the zero-padded string in
#   place (which is not a valid JSON number).
normalize_interval() {
  local raw=${1:-}
  local value
  [[ "$raw" =~ ^[0-9]+$ ]] || return 1
  value=$(( 10#$raw ))
  (( value >= 1 )) || return 1
  printf '%s\n' "$value"
}

if _normalized_interval=$(normalize_interval "$INTERVAL_SECS"); then
  INTERVAL_SECS=$_normalized_interval
else
  warn "Invalid interval '${INTERVAL_SECS}' — using default 3600."
  INTERVAL_SECS=3600
fi
unset _normalized_interval

# ── Create heartbeat directory ────────────────────────────────────────────────
# mkdir -p succeeds when the directory already exists. mkdir's own error text
# is discarded because the warning below reports the failure; "--" keeps a
# directory value that starts with "-" from being parsed as an option.
if ! mkdir -p -- "$HEARTBEAT_DIR" 2>/dev/null; then
  warn "Cannot create heartbeat dir '${HEARTBEAT_DIR}' — heartbeat not written."
  exit 0
fi

# ── Build JSON payload ────────────────────────────────────────────────────────
# Use python3 for reliable epoch float and JSON encoding.
# Falls back to date-based approach if python3 unavailable (whole seconds
# only in that case).
TIMESTAMP=$(python3 -c "import time; print(time.time())" 2>/dev/null \
  || date +%s)

# PID of this script's shell, recorded in the heartbeat.
CURRENT_PID=$$

# build_heartbeat_payload JOB TIMESTAMP INTERVAL PID
#   Prints the one-line JSON heartbeat document on stdout.
#   python3 does the encoding when it is available and accepts the values;
#   if it prints nothing (missing interpreter, or a value it cannot convert),
#   a minimal hand-assembled document with the same four keys is printed
#   instead. The fallback does no escaping, so JOB must already be limited to
#   characters that are safe inside a JSON string.
build_heartbeat_payload() {
  local job=${1:-} ts=${2:-} interval=${3:-} pid=${4:-}
  local json
  json=$(python3 -c '
import json, sys
print(json.dumps({
    "job": sys.argv[1],
    "timestamp": float(sys.argv[2]),
    "interval": int(sys.argv[3]),
    "pid": int(sys.argv[4]),
}))
' "$job" "$ts" "$interval" "$pid" 2>/dev/null)

  if [[ -z "$json" ]]; then
    # Minimal fallback if python3 fails
    json="{\"job\":\"${job}\",\"timestamp\":${ts},\"interval\":${interval},\"pid\":${pid}}"
  fi
  printf '%s\n' "$json"
}

PAYLOAD=$(build_heartbeat_payload \
  "$SAFE_JOB_NAME" "$TIMESTAMP" "$INTERVAL_SECS" "$CURRENT_PID")

# ── Atomic write via temp + rename ────────────────────────────────────────────
# Writes to a temp file first then renames, so bezalel_heartbeat_check.py
# never sees a partial file mid-write. This is the poka-yoke atomic guarantee.
# The temp file lives in the same directory as the target (so the rename stays
# on one filesystem) and carries this shell's PID in its name.
TARGET_FILE="${HEARTBEAT_DIR}/${SAFE_JOB_NAME}.last"
TMP_FILE="${HEARTBEAT_DIR}/.${SAFE_JOB_NAME}.last.tmp.$$"

# write_heartbeat_atomic PAYLOAD TMP_PATH TARGET_PATH
#   Writes PAYLOAD plus a trailing newline to TMP_PATH, then renames TMP_PATH
#   onto TARGET_PATH.
#   Returns: 0 heartbeat committed
#            1 writing the temp file failed
#            2 the rename failed
#   On failure the temp file is removed (best effort) and nothing is printed;
#   reporting is left to the caller.
write_heartbeat_atomic() {
  local payload=${1:-} tmp_path=${2:-} target_path=${3:-}

  # "2>/dev/null" must come BEFORE the output redirection: redirections are
  # applied left to right, so in the opposite order the shell reports a failed
  # open of the temp file before stderr has been silenced.
  if ! printf '%s\n' "$payload" 2>/dev/null > "$tmp_path"; then
    rm -f -- "$tmp_path" 2>/dev/null || true
    return 1
  fi

  if ! mv -- "$tmp_path" "$target_path" 2>/dev/null; then
    rm -f -- "$tmp_path" 2>/dev/null || true
    return 2
  fi
  return 0
}

write_heartbeat_atomic "$PAYLOAD" "$TMP_FILE" "$TARGET_FILE"
case $? in
  0)
    log "Heartbeat written: ${TARGET_FILE} (job=${SAFE_JOB_NAME}, interval=${INTERVAL_SECS}s)"
    ;;
  2)
    warn "mv failed for '${TMP_FILE}' → '${TARGET_FILE}' — heartbeat not committed."
    ;;
  *)
    warn "Write to temp file '${TMP_FILE}' failed — heartbeat not written."
    ;;
esac

# Always exit 0 — never crash the calling cron job.
# Failures above are reported as warnings on stderr only.
exit 0