#!/usr/bin/env bash
#
# Meta-heartbeat — checks all Bezalel cron jobs for stale timestamps.
#
# Each monitored job is represented by a "<name>.last" file inside
# HEARTBEAT_DIR; the file's modification time is compared against a
# per-job limit. Results are written to stdout and appended to ALERT_LOG.
# The script exits with status 1 when at least one heartbeat is missing
# or stale.

# Abort on unhandled errors, unset variables and failures inside pipelines.
set -euo pipefail

# Directory holding one "<name>.last" heartbeat file per job.
readonly HEARTBEAT_DIR="/var/lib/bezalel/heartbeats"
# File that every log line is appended to.
readonly ALERT_LOG="/var/log/bezalel_meta_heartbeat.log"
# Generic staleness threshold in minutes; the entries in HEARTBEATS carry
# their own per-job limits.
readonly STALE_MINUTES=30

#######################################
# Emit a timestamped message to stdout and append it to the alert log.
# Globals:   ALERT_LOG (read) - path of the log file to append to
# Arguments: $1 - message text
# Outputs:   "[<ISO-8601 timestamp>] <message>" on stdout and in ALERT_LOG;
#            a diagnostic on stderr if ALERT_LOG cannot be written
# Returns:   0, even when the log file is not writable
#######################################
log() {
  local line
  line="[$(date -Iseconds)] ${1-}"

  printf '%s\n' "$line"

  # Appending is best-effort: an unwritable log file must not abort the
  # caller (which runs under `set -e`) before it has finished its checks.
  if ! printf '%s\n' "$line" >>"$ALERT_LOG"; then
    printf 'log: cannot append to %s\n' "$ALERT_LOG" >&2 || true
  fi
}

# Make sure the heartbeat directory exists before looking for files in it.
mkdir -p -- "$HEARTBEAT_DIR"

# Expected heartbeats, one "name:max_stale_minutes" string per job.
# "name" selects the file "<name>.last" in HEARTBEAT_DIR; the number after
# the colon is the maximum tolerated age of that file in minutes.
HEARTBEATS=(
  "nightly_watch:150"     # 2.5h — runs at 02:00
  "mempalace_nightly:150" # 2.5h — runs at 03:00
  "db_backup:150"         # 2.5h — runs at 03:30
  "runner_health:15"      # 15m — every 5 min
)

#######################################
# Check every entry of HEARTBEATS against the modification time of its
# "<name>.last" file and log one MISSING / STALE / OK line per entry.
# Globals:   HEARTBEATS    (read) - "name:max_minutes" entries
#            HEARTBEAT_DIR (read) - directory containing the .last files
#            NOW_EPOCH     (read) - reference time, seconds since the epoch
#            STALE_MINUTES (read) - limit used when an entry has none
#            FAILURES      (written) - incremented per missing/stale entry
# Arguments: none
# Outputs:   one line per heartbeat, emitted through log
# Returns:   0; problems are reported through FAILURES
#######################################
check_heartbeats() {
  local entry name max_minutes file mtime age_min

  for entry in "${HEARTBEATS[@]}"; do
    name="${entry%%:*}"

    # Take the limit from the part after the last colon; an entry written
    # without one falls back to the script-wide default.
    max_minutes=""
    if [[ "$entry" == *:* ]]; then
      max_minutes="${entry##*:}"
    fi
    if [[ -z "$max_minutes" ]]; then
      max_minutes="${STALE_MINUTES:-30}"
    fi

    file="${HEARTBEAT_DIR}/${name}.last"

    # stat can still fail after the -f test (e.g. the file is removed in
    # between); report that as a missing heartbeat instead of letting
    # `set -e` terminate the run without a log line.
    if [[ ! -f "$file" ]] || ! mtime=$(stat -c %Y -- "$file" 2>/dev/null); then
      log "MISSING: $name heartbeat file not found ($file)"
      FAILURES=$((FAILURES + 1))
      continue
    fi

    # Age in whole minutes (integer division).
    age_min=$(( (NOW_EPOCH - mtime) / 60 ))

    if (( age_min > max_minutes )); then
      log "STALE: $name is ${age_min}m old (max ${max_minutes}m)"
      FAILURES=$((FAILURES + 1))
    else
      log "OK: $name is ${age_min}m old"
    fi
  done

  return 0
}

# One reference timestamp for the whole run so all ages are comparable.
NOW_EPOCH=$(date +%s)
FAILURES=0

check_heartbeats

# Summarise; a non-zero exit status signals that something needs attention.
if (( FAILURES > 0 )); then
  log "ALERT: $FAILURES stale/missing heartbeat(s) detected."
  exit 1
else
  log "ALL_OK: All heartbeats healthy."
fi