- scripts/fleet_health_probe.sh: SSH, disk, memory, process checks
- infrastructure/cron/fleet-health.crontab: 5-minute cron schedule
- Thresholds: disk<90%, mem<90%, critical processes monitored
84 lines
1.9 KiB
Bash
84 lines
1.9 KiB
Bash
#!/usr/bin/env bash
#
# fleet_health_probe.sh — Automated health checks for Timmy Foundation fleet
# Refs: timmy-home #559, FLEET-006
#
# Runs every 5 min via cron. Checks: SSH reachability, disk < 90%,
# memory < 90%, critical processes.
#
# Exit status: 0 when every check passes, non-zero otherwise.
set -euo pipefail

# LOG_DIR holds the probe log; HEARTBEAT_DIR holds the last-run marker.
# Both accept an environment override (same convention as FLEET_HOSTS) so the
# probe can run somewhere other than the system paths.
LOG_DIR="${LOG_DIR:-/var/log/timmy}"
ALERT_LOG="${LOG_DIR}/fleet_health.log"
HEARTBEAT_DIR="${HEARTBEAT_DIR:-/var/lib/timmy/heartbeats}"

# Create both directories up front; under `set -e` a failure here stops the
# probe before any check runs.
mkdir -p -- "$LOG_DIR" "$HEARTBEAT_DIR"

# Configurable thresholds, as whole-number percentages. A check passes only
# while usage is strictly below its threshold. Override via the environment.
DISK_THRESHOLD="${DISK_THRESHOLD:-90}"
MEM_THRESHOLD="${MEM_THRESHOLD:-90}"

# Reject anything that is not a plain decimal integer: the comparisons later
# evaluate these arithmetically, where an empty string counts as 0 and a
# leading zero is read as octal.
for threshold_name in DISK_THRESHOLD MEM_THRESHOLD; do
  if [[ ! "${!threshold_name}" =~ ^(0|[1-9][0-9]*)$ ]]; then
    printf '%s must be a whole number, got: %s\n' \
      "$threshold_name" "${!threshold_name}" >&2
    exit 2
  fi
done
unset threshold_name

# Hosts to probe (space-separated SSH hosts)
FLEET_HOSTS="${FLEET_HOSTS:-143.198.27.163 104.131.15.18}"

# Critical processes that must be running locally (space-separated patterns)
CRITICAL_PROCESSES="${CRITICAL_PROCESSES:-act_runner}"

#######################################
# Print a timestamped message and append the same line to the alert log.
# Globals:   ALERT_LOG (read) - file the line is appended to
# Arguments: $1 - message text; treated as empty when omitted
# Outputs:   "[<date -Iseconds>] <message>" on stdout and in ALERT_LOG
#######################################
log() {
  # Default to empty so a bare call does not trip `set -u`.
  local message="${1:-}"
  # printf emits the message verbatim; echo may interpret backslashes when
  # the xpg_echo shell option is enabled.
  printf '[%s] %s\n' "$(date -Iseconds)" "$message" | tee -a -- "$ALERT_LOG"
}

#######################################
# Record a failed check by logging the message with an "ALERT: " prefix.
# Arguments: $1 - description of the failure; treated as empty when omitted
# Outputs:   whatever log() writes
#######################################
alert() {
  # ${1:-} keeps a bare call from aborting the probe under `set -u`.
  log "ALERT: ${1:-}"
}

#######################################
# Record a passing check by logging the message with an "OK: " prefix.
# Arguments: $1 - description of the check; treated as empty when omitted
# Outputs:   whatever log() writes
#######################################
ok() {
  # ${1:-} keeps a bare call from aborting the probe under `set -u`.
  log "OK: ${1:-}"
}

# Overall result: stays 0 until a check fails, then becomes the exit code.
status=0

# --- SSH Reachability ---
#######################################
# Check that TCP port 22 accepts a connection on every fleet host.
# Globals:   FLEET_HOSTS (read) - whitespace-separated host list
#            status (written)   - set to 1 when any host is unreachable
# Outputs:   one ok/alert line per host
#######################################
check_ssh_reachability() {
  local host
  # Word splitting is intentional: FLEET_HOSTS is a whitespace-separated list.
  for host in $FLEET_HOSTS; do
    # Try nc first; if it is missing or fails, fall back to bash's /dev/tcp.
    # The host travels as "$1" instead of being spliced into the command
    # string, so nothing in FLEET_HOSTS is ever parsed as shell code.
    if nc -z -w 5 "$host" 22 >/dev/null 2>&1 \
      || timeout 5 bash -c '</dev/tcp/"$1"/22' _ "$host" 2>/dev/null; then
      ok "SSH reachable: $host"
    else
      alert "SSH unreachable: $host"
      status=1
    fi
  done
}

check_ssh_reachability

# --- Disk Usage ---
#######################################
# Compare root filesystem usage against DISK_THRESHOLD.
# Globals:   DISK_THRESHOLD (read) - percentage usage must stay below
#            status (written)      - set to 1 on breach or unreadable value
# Outputs:   one ok/alert line
#######################################
check_disk_usage() {
  local usage
  # -P selects the POSIX output format, which keeps each filesystem on a
  # single line; without it a long device name can wrap and leave row 2
  # without a percentage column.
  usage="$(df -P / | awk 'NR==2 {print $5}' | tr -d '%')" || usage=""

  if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
    # An empty value would compare as 0 and be reported as healthy.
    alert "Disk usage unknown: could not read a percentage from df"
    status=1
  elif [[ "$usage" -lt "$DISK_THRESHOLD" ]]; then
    ok "Disk usage: ${usage}%"
  else
    alert "Disk usage critical: ${usage}%"
    status=1
  fi
}

check_disk_usage

# --- Memory Usage ---
#######################################
# Compare used/total memory, as reported by free, against MEM_THRESHOLD.
# Globals:   MEM_THRESHOLD (read) - percentage usage must stay below
#            status (written)     - set to 1 on breach or unreadable value
# Outputs:   one ok/alert line
#######################################
check_memory_usage() {
  local usage
  # LC_ALL=C pins the "Mem:" row label, which is translated in some locales.
  # The $2 > 0 guard avoids a division by zero on a zero total.
  usage="$(LC_ALL=C free \
    | awk '$1 == "Mem:" && $2 > 0 {printf("%.0f", $3 / $2 * 100.0)}')" \
    || usage=""

  if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
    # An empty value would compare as 0 and be reported as healthy.
    alert "Memory usage unknown: could not read a percentage from free"
    status=1
  elif [[ "$usage" -lt "$MEM_THRESHOLD" ]]; then
    ok "Memory usage: ${usage}%"
  else
    alert "Memory usage critical: ${usage}%"
    status=1
  fi
}

check_memory_usage

# --- Critical Processes ---
#######################################
# Check that each pattern in CRITICAL_PROCESSES matches a running process.
# Globals:   CRITICAL_PROCESSES (read) - whitespace-separated pgrep patterns
#            status (written)          - set to 1 when any pattern is unmatched
# Outputs:   one ok/alert line per pattern
#######################################
check_critical_processes() {
  local proc
  # Word splitting is intentional: the list is whitespace-separated.
  for proc in $CRITICAL_PROCESSES; do
    # "--" stops a pattern that begins with "-" from being read as an option.
    # NOTE(review): -f matches against full command lines, so any process
    # whose arguments merely mention the name also counts as alive — confirm
    # this is the intended matching before tightening it.
    if pgrep -f -- "$proc" >/dev/null 2>&1; then
      ok "Process alive: $proc"
    else
      alert "Process missing: $proc"
      status=1
    fi
  done
}

check_critical_processes

# --- Heartbeat Touch ---
# The marker is refreshed on every run, whether or not the checks passed.
# A failed touch is reported as an alert instead of letting `set -e` abort
# the script before the summary line is written.
heartbeat_file="${HEARTBEAT_DIR}/fleet_health.last"
if ! touch -- "$heartbeat_file"; then
  alert "Heartbeat update failed: ${heartbeat_file}"
  status=1
fi

# Summary line, then hand the aggregate result to the caller as exit code.
if [[ "$status" -eq 0 ]]; then
  log "Fleet health probe passed."
else
  log "Fleet health probe FAILED."
fi

exit "$status"