#!/usr/bin/env bash
# Meta-heartbeat — checks all Bezalel cron jobs for stale timestamps.
#
# For every entry in HEARTBEATS, the modification time of
# "$HEARTBEAT_DIR/<name>.last" is compared with the current time. Each verdict
# (OK / STALE / MISSING) is written to stdout and appended to $ALERT_LOG.
#
# Exit status:
#   0  every heartbeat file exists and is within its age limit
#   1  at least one heartbeat is stale, missing, or unreadable
#
# Requires GNU coreutils (`stat -c`, `date -Iseconds`).
set -euo pipefail

readonly HEARTBEAT_DIR="/var/lib/bezalel/heartbeats"
readonly ALERT_LOG="/var/log/bezalel_meta_heartbeat.log"
# Fallback age limit (minutes) for HEARTBEATS entries that carry no
# ":<minutes>" suffix.
readonly STALE_MINUTES=30

# Expected heartbeats, one "name:max_stale_minutes" entry per job.
# NOTE(review): a 150m limit on once-a-day jobs will read STALE for most of the
# day unless this check itself only runs shortly after them — confirm the
# schedule this script is invoked on.
readonly -a HEARTBEATS=(
  "nightly_watch:150"      # 2.5h — runs at 02:00
  "mempalace_nightly:150"  # 2.5h — runs at 03:00
  "db_backup:150"          # 2.5h — runs at 03:30
  "runner_health:15"       # 15m — every 5 min
)

#######################################
# Print a timestamped message to stdout and append it to the alert log.
# Globals:   ALERT_LOG (read)
# Arguments: $1 - message text
# Outputs:   "[<ISO-8601 timestamp>] <message>" on stdout and in ALERT_LOG
# Returns:   non-zero if tee cannot append to ALERT_LOG; with pipefail and
#            set -e in effect that aborts the script.
#######################################
log() {
  printf '[%s] %s\n' "$(date -Iseconds)" "$1" | tee -a "$ALERT_LOG"
}

#######################################
# Check every configured heartbeat and report the overall result.
# Globals:   HEARTBEAT_DIR, HEARTBEATS, STALE_MINUTES (read)
# Arguments: none
# Outputs:   one log line per heartbeat plus a final ALERT / ALL_OK summary
# Returns:   0 if all heartbeats are healthy, 1 otherwise
#######################################
main() {
  local now_epoch
  local failures=0
  local entry name max_minutes file last_epoch age_min

  mkdir -p -- "$HEARTBEAT_DIR"
  now_epoch=$(date +%s)

  for entry in "${HEARTBEATS[@]}"; do
    name=${entry%%:*}
    if [[ "$entry" == *:* ]]; then
      max_minutes=${entry##*:}
    else
      # No explicit limit given: use the script-wide default.
      max_minutes=$STALE_MINUTES
    fi
    file="${HEARTBEAT_DIR}/${name}.last"

    if [[ ! -f "$file" ]]; then
      log "MISSING: $name heartbeat file not found ($file)"
      failures=$((failures + 1))
      continue
    fi

    # The file can disappear between the -f test and stat; count that as a
    # failure instead of letting set -e abort the remaining checks. stat's own
    # error message is left on stderr for diagnosis.
    if ! last_epoch=$(stat -c %Y -- "$file"); then
      log "MISSING: $name heartbeat file could not be read ($file)"
      failures=$((failures + 1))
      continue
    fi

    # Whole minutes since the file was last modified (integer division).
    age_min=$(( (now_epoch - last_epoch) / 60 ))

    if (( age_min > max_minutes )); then
      log "STALE: $name is ${age_min}m old (max ${max_minutes}m)"
      failures=$((failures + 1))
    else
      log "OK: $name is ${age_min}m old"
    fi
  done

  if (( failures > 0 )); then
    log "ALERT: $failures stale/missing heartbeat(s) detected."
    return 1
  fi

  log "ALL_OK: All heartbeats healthy."
}

main "$@"