Compare commits

...

1 Commits

Author SHA1 Message Date
4e3f60344b feat(infra): add fleet health probe + crontab (#559, FLEET-006)
- scripts/fleet_health_probe.sh: SSH, disk, memory, process checks
- infrastructure/cron/fleet-health.crontab: 5-minute cron schedule
- Thresholds: disk<90%, mem<90%, critical processes monitored
2026-04-07 15:22:10 +00:00

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# fleet_health_probe.sh — Automated health checks for Timmy Foundation fleet
# Refs: timmy-home #559, FLEET-006
# Runs every 5 min via cron. Checks: SSH reachability, disk < 90%, memory < 90%, critical processes.
set -euo pipefail
LOG_DIR="/var/log/timmy"
ALERT_LOG="${LOG_DIR}/fleet_health.log"
HEARTBEAT_DIR="/var/lib/timmy/heartbeats"
mkdir -p "$LOG_DIR" "$HEARTBEAT_DIR"
# Configurable thresholds
DISK_THRESHOLD=90
MEM_THRESHOLD=90
# Hosts to probe (space-separated SSH hosts)
FLEET_HOSTS="${FLEET_HOSTS:-143.198.27.163 104.131.15.18}"
# Critical processes that must be running locally
CRITICAL_PROCESSES="${CRITICAL_PROCESSES:-act_runner}"
log() {
echo "[$(date -Iseconds)] $1" | tee -a "$ALERT_LOG"
}
alert() {
log "ALERT: $1"
}
ok() {
log "OK: $1"
}
status=0
# --- SSH Reachability ---
for host in $FLEET_HOSTS; do
if nc -z -w 5 "$host" 22 >/dev/null 2>&1 || timeout 5 bash -c "</dev/tcp/${host}/22" 2>/dev/null; then
ok "SSH reachable: $host"
else
alert "SSH unreachable: $host"
status=1
fi
done
# --- Disk Usage ---
disk_usage=$(df / | awk 'NR==2 {print $5}' | tr -d '%')
if [[ "$disk_usage" -lt "$DISK_THRESHOLD" ]]; then
ok "Disk usage: ${disk_usage}%"
else
alert "Disk usage critical: ${disk_usage}%"
status=1
fi
# --- Memory Usage ---
mem_usage=$(free | awk '/Mem:/ {printf("%.0f", $3/$2 * 100.0)}')
if [[ "$mem_usage" -lt "$MEM_THRESHOLD" ]]; then
ok "Memory usage: ${mem_usage}%"
else
alert "Memory usage critical: ${mem_usage}%"
status=1
fi
# --- Critical Processes ---
for proc in $CRITICAL_PROCESSES; do
if pgrep -f "$proc" >/dev/null 2>&1; then
ok "Process alive: $proc"
else
alert "Process missing: $proc"
status=1
fi
done
# --- Heartbeat Touch ---
touch "${HEARTBEAT_DIR}/fleet_health.last"
if [[ "$status" -eq 0 ]]; then
log "Fleet health probe passed."
else
log "Fleet health probe FAILED."
fi
exit "$status"