Compare commits
1 Commits
gemini/iss
...
bezalel/fl
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e3f60344b |
83
scripts/fleet_health_probe.sh
Normal file
83
scripts/fleet_health_probe.sh
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env bash
# fleet_health_probe.sh — Automated health checks for Timmy Foundation fleet
# Refs: timmy-home #559, FLEET-006
#
# Intended to run every 5 minutes from cron. Checks performed:
#   - SSH reachability of each fleet host (TCP connect to port 22)
#   - root filesystem usage below 90%
#   - memory usage below 90%
#   - critical processes running on this machine
#
# Environment overrides: FLEET_HOSTS, CRITICAL_PROCESSES (space-separated).
# Exit status: 0 when every check passes, 1 when at least one check fails.
set -o errexit -o nounset -o pipefail

# Results are appended to ALERT_LOG; HEARTBEAT_DIR holds the last-run marker.
# Both directories are created up front; with errexit on, a failure to create
# them stops the probe here.
HEARTBEAT_DIR="/var/lib/timmy/heartbeats"
LOG_DIR="/var/log/timmy"
ALERT_LOG="$LOG_DIR/fleet_health.log"
mkdir -p "$LOG_DIR" "$HEARTBEAT_DIR"
|
||||
|
||||
# Configurable thresholds (integer percentages). A check raises an alert once
# usage reaches the threshold, i.e. only values strictly below it are OK.
# Like the host and process lists below, each can be overridden from the
# environment; the defaults are unchanged.
DISK_THRESHOLD="${DISK_THRESHOLD:-90}"
MEM_THRESHOLD="${MEM_THRESHOLD:-90}"

# Hosts to probe (space-separated SSH hosts)
FLEET_HOSTS="${FLEET_HOSTS:-143.198.27.163 104.131.15.18}"

# Critical processes that must be running locally (space-separated patterns)
CRITICAL_PROCESSES="${CRITICAL_PROCESSES:-act_runner}"
|
||||
|
||||
#######################################
# Write one timestamped line to stdout and append it to the alert log.
# Globals:   ALERT_LOG (read) - path of the log file to append to
# Arguments: $1 - message text
# Outputs:   "[<ISO-8601 timestamp>] <message>" on stdout and in ALERT_LOG;
#            a warning on stderr when the log file cannot be appended to
# Returns:   0 even when the append fails
#######################################
log() {
  local message=$1
  # tee still echoes the line to stdout when it cannot open the log file.
  # Under errexit + pipefail that failure would otherwise terminate the whole
  # probe at its first log call, so it is reported on stderr instead.
  printf '[%s] %s\n' "$(date -Iseconds)" "$message" | tee -a "$ALERT_LOG" \
    || printf 'fleet_health_probe: could not append to %s\n' "$ALERT_LOG" >&2
}
|
||||
|
||||
# Record a failed check: forwards "$1" to log() with an "ALERT: " prefix.
alert() {
  local message=$1
  log "ALERT: ${message}"
}
|
||||
|
||||
# Record a passing check: forwards "$1" to log() with an "OK: " prefix.
ok() {
  local message=$1
  log "OK: ${message}"
}
|
||||
|
||||
# Overall result: stays 0 while every check passes, set to 1 by any failure.
status=0

# --- SSH Reachability ---
# "Reachable" means a TCP connection to port 22 can be opened within 5
# seconds; no SSH handshake or login is attempted. nc is tried first and
# bash's /dev/tcp pseudo-device is the fallback whenever nc returns non-zero
# (including when nc is not installed).
# FLEET_HOSTS is deliberately unquoted so it word-splits into one host per
# iteration.
for host in $FLEET_HOSTS; do
  # The host is handed to the inner bash as "$1" instead of being spliced
  # into the -c string, so a value containing shell metacharacters is treated
  # as data and cannot execute as code.
  if nc -z -w 5 "$host" 22 >/dev/null 2>&1 \
    || timeout 5 bash -c '</dev/tcp/"$1"/22' _ "$host" 2>/dev/null; then
    ok "SSH reachable: $host"
  else
    alert "SSH unreachable: $host"
    status=1
  fi
done
|
||||
|
||||
# --- Disk Usage ---
# Use% of the root filesystem, as an integer without the trailing "%".
# -P forces POSIX output with one line per filesystem; without it a long
# device name can wrap onto a second line and shift the columns awk reads.
# A df failure yields an empty value rather than terminating the probe, so it
# is reported below as an alert.
disk_usage=$(df -P / | awk 'NR==2 {print $5}' | tr -d '%') || disk_usage=""
if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
  # An empty or non-numeric value would compare as 0 and pass silently.
  alert "Disk usage unknown: could not parse df output"
  status=1
elif [[ "$disk_usage" -lt "$DISK_THRESHOLD" ]]; then
  ok "Disk usage: ${disk_usage}%"
else
  alert "Disk usage critical: ${disk_usage}%"
  status=1
fi
|
||||
|
||||
# --- Memory Usage ---
# Percentage computed from the "Mem:" row of free: column 3 divided by
# column 2, rounded to an integer.
# LC_ALL=C keeps the row label untranslated so the awk pattern matches in any
# locale; the $2 > 0 guard avoids a division by zero. A free failure yields an
# empty value rather than terminating the probe, so it is reported below.
mem_usage=$(
  LC_ALL=C free \
    | awk '/^Mem:/ && $2 > 0 { printf("%.0f", $3 / $2 * 100.0) }'
) || mem_usage=""
if [[ ! "$mem_usage" =~ ^[0-9]+$ ]]; then
  # An empty or non-numeric value would compare as 0 and pass silently.
  alert "Memory usage unknown: could not parse free output"
  status=1
elif [[ "$mem_usage" -lt "$MEM_THRESHOLD" ]]; then
  ok "Memory usage: ${mem_usage}%"
else
  alert "Memory usage critical: ${mem_usage}%"
  status=1
fi
|
||||
|
||||
# --- Critical Processes ---
# Each whitespace-separated pattern in CRITICAL_PROCESSES must match at least
# one running process on this machine.
# NOTE(review): pgrep -f matches against full command lines, so a process
# whose arguments merely mention the pattern also counts as alive — confirm
# that this is intended.
for proc in $CRITICAL_PROCESSES; do
  if ! pgrep -f "$proc" &>/dev/null; then
    alert "Process missing: $proc"
    status=1
    continue
  fi
  ok "Process alive: $proc"
done
|
||||
|
||||
# --- Heartbeat Touch ---
# The marker is refreshed on every completed run, whether or not the checks
# passed; it records that the probe ran, not that the fleet is healthy.
touch "${HEARTBEAT_DIR}/fleet_health.last"

# Log a one-line verdict, then exit with the accumulated status.
summary="Fleet health probe passed."
if [[ "$status" -ne 0 ]]; then
  summary="Fleet health probe FAILED."
fi
log "$summary"

exit "$status"
|
||||
Reference in New Issue
Block a user