- scripts/auto_restart_agent.sh — monitor and restart dead processes (3-attempt backoff)
- scripts/backup_pipeline.sh — daily backups with retention + offsite rsync hook
- scripts/telegram_thread_reporter.py — route messages to ops/burn/main threads
- infrastructure/cron/*.crontab — scheduling for new automations
64 lines
2.1 KiB
Bash
64 lines
2.1 KiB
Bash
#!/usr/bin/env bash
# auto_restart_agent.sh — Auto-restart dead critical processes (FLEET-007)
# Refs: timmy-home #560
#
# Each invocation makes one pass over the configured processes: a process
# that is not running gets its restart command executed (at most 3 attempts,
# counted across invocations in a state file), after which the script only
# escalates instead of restarting.
#
# Environment:
#   AUTO_RESTART_PROCESSES  optional comma-separated "name:command" list
#   TELEGRAM_BOT_TOKEN      optional; notifications are sent only when both
#   TELEGRAM_CHAT_ID        the token and the chat id are non-empty
set -euo pipefail

# Log destination and the directory holding the per-process restart counters.
LOG_DIR="/var/log/timmy"
ALERT_LOG="${LOG_DIR}/auto_restart.log"
STATE_DIR="/var/lib/timmy/restarts"
mkdir -p -- "$LOG_DIR" "$STATE_DIR"

# Telegram credentials are optional. Defaulting them to the empty string keeps
# `set -u` from aborting when they are not exported.
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"

#######################################
# Print a timestamped message to stdout and append it to the alert log.
# Globals:   ALERT_LOG (read)
# Arguments: $1 - message text
# Outputs:   "[<ISO-8601 timestamp>] <message>" on stdout and in $ALERT_LOG
# Returns:   0 always (logging is best-effort)
#######################################
log() {
  local line
  line="[$(date -Iseconds)] ${1:-}"
  # With `set -e` + `pipefail` a failing tee (unwritable log file, full disk)
  # would abort the whole script before any restart is attempted, so the
  # failure is deliberately swallowed; tee still reports it on stderr.
  printf '%s\n' "$line" | tee -a "$ALERT_LOG" || true
}

#######################################
# Send a notification through the Telegram Bot API (best-effort).
# Does nothing unless both credentials are non-empty. curl failures are
# ignored on purpose so a notification problem never aborts the script.
# Globals:   TELEGRAM_BOT_TOKEN (read), TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text
# Returns:   0 always
#######################################
send_telegram() {
  local msg="${1:-}"
  local token="${TELEGRAM_BOT_TOKEN:-}"
  local chat_id="${TELEGRAM_CHAT_ID:-}"

  [[ -n "$token" && -n "$chat_id" ]] || return 0

  # --data-urlencode: a plain -d would send '&', '+' and '%' from the message
  #   unescaped and corrupt the form body.
  # --max-time: a stalled API call must not hang the watchdog.
  curl -s --max-time 15 -X POST \
    "https://api.telegram.org/bot${token}/sendMessage" \
    -d "chat_id=${chat_id}" \
    --data-urlencode "text=${msg}" >/dev/null 2>&1 || true
}

# Processes to watch, as a comma-separated list of "process_name:restart_command"
# entries. Each entry is split at its first ':', so a command may contain
# colons but not commas. process_name is handed to `pgrep -f` as the pattern
# that decides whether the process is alive.
# Override the list via the AUTO_RESTART_PROCESSES environment variable.
DEFAULT_PROCESSES='act_runner:cd /opt/gitea-runner && nohup ./act_runner daemon >/var/log/gitea-runner.log 2>&1 &'
PROCESSES="${AUTO_RESTART_PROCESSES:-${DEFAULT_PROCESSES}}"

#######################################
# Strip leading and trailing whitespace from a string.
# Pure parameter expansion: unlike piping through xargs, quotes, backslashes
# and inner spacing of the value are left untouched.
# Arguments: $1 - string to trim
# Outputs:   trimmed string on stdout (no trailing newline)
#######################################
trim_ws() {
  local s="${1:-}"
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s' "$s"
}

#######################################
# Read a persisted restart counter.
# Arguments: $1 - path of the counter file
# Outputs:   the counter on stdout; 0 when the file is missing, empty or
#            does not hold a plain non-negative integer
#######################################
read_restart_count() {
  local file="$1" value=""
  if [[ -r "$file" ]]; then
    IFS= read -r value <"$file" || true
  fi
  [[ "$value" =~ ^[0-9]+$ ]] || value=0
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  printf '%s' "$((10#$value))"
}

#######################################
# Check one "name:command" entry and restart the process if it is dead.
# Globals:   STATE_DIR (read); calls log and send_telegram
# Arguments: $1 - entry in the form "process_name:restart_command"
# Returns:   0 (malformed or empty entries are logged/skipped, not fatal)
#######################################
check_process() {
  local entry="${1:-}"
  local -r max_attempts=3
  local proc_name restart_cmd state_file count

  # Whitespace-only entries (e.g. from a doubled comma) are silently skipped.
  [[ -n "$(trim_ws "$entry")" ]] || return 0

  if [[ "$entry" != *:* ]]; then
    log "ERROR: malformed entry (expected 'name:command'): $entry"
    return 0
  fi

  # Split at the first ':' only, so the command itself may contain colons.
  proc_name=$(trim_ws "${entry%%:*}")
  restart_cmd=$(trim_ws "${entry#*:}")
  if [[ -z "$proc_name" || -z "$restart_cmd" ]]; then
    log "ERROR: malformed entry (empty name or command): $entry"
    return 0
  fi

  # '/' in the name would point the counter file into a missing subdirectory.
  state_file="${STATE_DIR}/${proc_name//\//_}.count"
  count=$(read_restart_count "$state_file")

  # NOTE: -f matches against the full command line, so any process whose
  # arguments contain the name counts as "alive".
  if pgrep -f -- "$proc_name" >/dev/null 2>&1; then
    # Process alive — reset counter
    if ((count != 0)); then
      echo 0 >"$state_file"
      log "$proc_name is healthy — reset restart counter"
    fi
    return 0
  fi

  # Process dead
  count=$((count + 1))
  printf '%s\n' "$count" >"$state_file"

  if ((count <= max_attempts)); then
    log "CRITICAL: $proc_name is dead (attempt $count/${max_attempts}). Restarting..."
    # The command comes from operator-controlled configuration and is eval'd
    # as shell; it must never be fed from untrusted input. Running it in a
    # subshell keeps a `cd` or `exit` inside it from affecting later entries.
    if (eval "$restart_cmd"); then
      send_telegram "🔄 Auto-restarted $proc_name (attempt $count/${max_attempts})"
    else
      log "ERROR: restart command failed for $proc_name"
      send_telegram "⚠️ Restart command failed for $proc_name (attempt $count/${max_attempts})"
    fi
  else
    # NOTE(review): this branch is taken on every run while the process stays
    # dead, so the escalation message repeats until someone intervenes.
    log "ESCALATION: $proc_name still dead after ${max_attempts} restart attempts."
    send_telegram "🚨 ESCALATION: $proc_name failed to restart after ${max_attempts} attempts. Manual intervention required."
  fi
}

# --- main: one pass over the configured process list -------------------------
IFS=',' read -ra PROC_LIST <<< "$PROCESSES"

for entry in "${PROC_LIST[@]}"; do
  check_process "$entry"
done

# Marker whose mtime records when the last pass completed.
touch "${STATE_DIR}/auto_restart.last"