Issue #500 cross-audit discovered six untracked wolf-* processes running under /tmp/wolf-pack/ that were not reflected in systemd or the fleet health dashboards. This change adds detection to the automated health probe. Change: scripts/fleet_health_probe.sh — new 'Untracked Wolf-Pack Runtimes' section that uses pgrep to search for the 'wolf-[0-9]' pattern and logs a WARNING with the process count when any match is found. The check is informational only and does not fail the health probe (the probe's exit status is left unchanged). Smoke test: `bash -n scripts/fleet_health_probe.sh` reports syntax OK, and the script runs successfully with writable LOG_DIR/HEARTBEAT_DIR overrides. This is the smallest concrete fix implementing the tracking part of issue #500's action item 4 (Audit and track wolf pack runtime). Closes #500
93 lines
2.4 KiB
Bash
Executable File
93 lines
2.4 KiB
Bash
Executable File
#!/usr/bin/env bash
# fleet_health_probe.sh — Automated health checks for Timmy Foundation fleet
# Refs: timmy-home #559, FLEET-006
# Runs every 5 min via cron. Checks: SSH reachability, disk < 90%, memory < 90%, critical processes.
#
# Optional environment overrides for the output locations:
#   LOG_DIR        directory that holds fleet_health.log (default: /var/log/timmy)
#   HEARTBEAT_DIR  directory that holds the heartbeat file (default: /var/lib/timmy/heartbeats)
set -euo pipefail

# Honour a caller-supplied value so the probe can run unprivileged (smoke
# tests, CI); fall back to the production paths when nothing is exported.
LOG_DIR="${LOG_DIR:-/var/log/timmy}"
# Always derived from LOG_DIR so the log follows the directory override.
ALERT_LOG="${LOG_DIR}/fleet_health.log"
HEARTBEAT_DIR="${HEARTBEAT_DIR:-/var/lib/timmy/heartbeats}"

# Make sure the log and heartbeat directories exist before anything logs.
mkdir -p "$LOG_DIR" "$HEARTBEAT_DIR"

# Configurable thresholds: percentage at or above which usage is reported
# as critical. Export DISK_THRESHOLD / MEM_THRESHOLD to override the defaults.
DISK_THRESHOLD="${DISK_THRESHOLD:-90}"
MEM_THRESHOLD="${MEM_THRESHOLD:-90}"

# Hosts to probe (space-separated SSH hosts)
FLEET_HOSTS="${FLEET_HOSTS:-143.198.27.163 104.131.15.18}"

# Critical processes that must be running locally (space-separated pgrep patterns)
CRITICAL_PROCESSES="${CRITICAL_PROCESSES:-act_runner}"

# Write one timestamped line to stdout and append the same line to $ALERT_LOG.
# Globals:   ALERT_LOG (read) - file the line is appended to
# Arguments: $1 - message text
# Outputs:   "[<ISO-8601 timestamp>] <message>" on stdout and in the log file
log() {
  # The timestamp stays inline so a failing `date` cannot abort the caller
  # under `set -e`; tee duplicates the line into the log.
  printf '[%s] %s\n' "$(date -Iseconds)" "$1" | tee -a "$ALERT_LOG"
}

# Record a failure: forwards the message to log() with an "ALERT: " prefix.
# Arguments: $1 - description of the problem
alert() {
  local detail=$1
  log "ALERT: ${detail}"
}

# Record a passing check: forwards the message to log() with an "OK: " prefix.
# Arguments: $1 - description of what passed
ok() {
  local detail=$1
  log "OK: ${detail}"
}

# Overall result of the probe; any failing check flips it to 1.
status=0

# --- SSH Reachability ---
# A host counts as reachable when TCP port 22 accepts a connection within
# 5 seconds. nc is tried first; bash's /dev/tcp redirection is the fallback.
# FLEET_HOSTS is left unquoted on purpose: it is a space-separated list.
for target in $FLEET_HOSTS; do
  if ! nc -z -w 5 "$target" 22 >/dev/null 2>&1 \
    && ! timeout 5 bash -c "</dev/tcp/${target}/22" 2>/dev/null; then
    alert "SSH unreachable: $target"
    status=1
    continue
  fi
  ok "SSH reachable: $target"
done

# --- Disk Usage ---

# Print the used-space percentage of the root filesystem as a bare integer.
# Prints nothing when df fails or its output has no usable data row.
read_root_disk_usage() {
  # -P forces the POSIX layout (one line per filesystem), so the data row is
  # always line 2 even when a long device name would otherwise wrap.
  df -P / 2>/dev/null | awk 'NR==2 { sub(/%$/, "", $5); print $5 }'
}

# Compare a usage reading against DISK_THRESHOLD and log the verdict.
# Globals:   DISK_THRESHOLD (read), status (set to 1 on any failure)
# Arguments: $1 - used-space percentage (digits only, no % sign)
evaluate_disk_usage() {
  local usage=$1
  if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
    # An empty or garbled reading would otherwise compare as 0 and be
    # reported as healthy; treat "cannot measure" as a failure instead.
    alert "Disk usage unavailable (got '${usage}')"
    status=1
  elif [[ "$usage" -lt "$DISK_THRESHOLD" ]]; then
    ok "Disk usage: ${usage}%"
  else
    alert "Disk usage critical: ${usage}%"
    status=1
  fi
}

# "|| disk_usage=''" keeps a df failure from aborting the probe under
# set -e/pipefail; the empty reading is then reported as an alert.
disk_usage=$(read_root_disk_usage) || disk_usage=""
evaluate_disk_usage "$disk_usage"

# --- Memory Usage ---

# Print used memory as a rounded integer percentage of total memory
# (columns 3 and 2 of free's "Mem:" row). Prints nothing when free fails,
# the row is missing, or the total is zero.
read_memory_usage() {
  # LC_ALL=C keeps the "Mem:" row label untranslated; "$2 > 0" avoids a
  # division by zero inside awk.
  LC_ALL=C free 2>/dev/null \
    | awk '/^Mem:/ && $2 > 0 { printf("%.0f", $3 / $2 * 100.0) }'
}

# Compare a usage reading against MEM_THRESHOLD and log the verdict.
# Globals:   MEM_THRESHOLD (read), status (set to 1 on any failure)
# Arguments: $1 - used-memory percentage (digits only, no % sign)
evaluate_memory_usage() {
  local usage=$1
  if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
    # An empty reading would otherwise compare as 0 and be reported as
    # healthy; treat "cannot measure" as a failure instead.
    alert "Memory usage unavailable (got '${usage}')"
    status=1
  elif [[ "$usage" -lt "$MEM_THRESHOLD" ]]; then
    ok "Memory usage: ${usage}%"
  else
    alert "Memory usage critical: ${usage}%"
    status=1
  fi
}

# "|| mem_usage=''" keeps a failing free from aborting the probe under
# set -e/pipefail; the empty reading is then reported as an alert.
mem_usage=$(read_memory_usage) || mem_usage=""
evaluate_memory_usage "$mem_usage"

# --- Critical Processes ---
# Each entry of CRITICAL_PROCESSES is handed to `pgrep -f`, which matches it
# against full command lines. The list is left unquoted on purpose: it is
# space-separated.
for required in $CRITICAL_PROCESSES; do
  if ! pgrep -f "$required" >/dev/null 2>&1; then
    alert "Process missing: $required"
    status=1
    continue
  fi
  ok "Process alive: $required"
done

# --- Untracked Wolf-Pack Runtimes ---
# Detect any wolf-* processes that are not managed by systemd/fleet tracking.
# These processes exist under /tmp/wolf-pack/ and should appear in health logs.

# Log a WARNING with the number of processes whose command line matches
# "wolf-[0-9]". Logs nothing when there are none.
# Not marked as failure — informational only for now (status is untouched).
report_untracked_wolf_pack() {
  local wolf_count
  # One pgrep call instead of "test, then count": with two calls the
  # processes could exit in between, and the failed second pipeline would
  # abort the whole probe under set -e/pipefail. pgrep exits 1 when nothing
  # matches, hence "|| true".
  wolf_count=$(pgrep -c -f "wolf-[0-9]" 2>/dev/null || true)
  if [[ "$wolf_count" =~ ^[0-9]+$ ]] && (( wolf_count > 0 )); then
    log "WARNING: Untracked wolf-pack runtime detected — ${wolf_count} active processes (not in systemd/fleet tracking)"
  fi
}

report_untracked_wolf_pack

# --- Heartbeat Touch ---
# Updated on every run, whether or not the checks passed.
heartbeat_file="${HEARTBEAT_DIR}/fleet_health.last"
touch "$heartbeat_file"

# Summarise the run, then hand the accumulated status back to the caller.
if (( status != 0 )); then
  log "Fleet health probe FAILED."
else
  log "Fleet health probe passed."
fi

exit "$status"