Files
timmy-home/scripts/auto_restart_agent.sh
Bezalel 39540a2a8c feat(infra): auto-restart agent, backup pipeline, Telegram thread reporter (#560, #561, #895)
- scripts/auto_restart_agent.sh — monitor and restart dead processes (3-attempt backoff)
- scripts/backup_pipeline.sh — daily backups with retention + offsite rsync hook
- scripts/telegram_thread_reporter.py — route messages to ops/burn/main threads
- infrastructure/cron/*.crontab — scheduling for new automations
2026-04-07 15:43:21 +00:00

64 lines
2.1 KiB
Bash

#!/usr/bin/env bash
# auto_restart_agent.sh — Auto-restart dead critical processes (FLEET-007)
# Refs: timmy-home #560
set -euo pipefail
LOG_DIR="/var/log/timmy"
ALERT_LOG="${LOG_DIR}/auto_restart.log"
STATE_DIR="/var/lib/timmy/restarts"
mkdir -p "$LOG_DIR" "$STATE_DIR"
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"
log() { echo "[$(date -Iseconds)] $1" | tee -a "$ALERT_LOG"; }
send_telegram() {
local msg="$1"
if [[ -n "$TELEGRAM_BOT_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
-d "chat_id=${TELEGRAM_CHAT_ID}" -d "text=${msg}" >/dev/null 2>&1 || true
fi
}
# Format: "process_name:command_to_restart"
# Override via AUTO_RESTART_PROCESSES env var
DEFAULT_PROCESSES="act_runner:cd /opt/gitea-runner && nohup ./act_runner daemon >/var/log/gitea-runner.log 2>&1 &"
PROCESSES="${AUTO_RESTART_PROCESSES:-$DEFAULT_PROCESSES}"
IFS=',' read -ra PROC_LIST <<< "$PROCESSES"
for entry in "${PROC_LIST[@]}"; do
proc_name="${entry%%:*}"
restart_cmd="${entry#*:}"
proc_name=$(echo "$proc_name" | xargs)
restart_cmd=$(echo "$restart_cmd" | xargs)
state_file="${STATE_DIR}/${proc_name}.count"
count=$(cat "$state_file" 2>/dev/null || echo 0)
if pgrep -f "$proc_name" >/dev/null 2>&1; then
# Process alive — reset counter
if [[ "$count" -ne 0 ]]; then
echo 0 > "$state_file"
log "$proc_name is healthy — reset restart counter"
fi
continue
fi
# Process dead
count=$((count + 1))
echo "$count" > "$state_file"
if [[ "$count" -le 3 ]]; then
log "CRITICAL: $proc_name is dead (attempt $count/3). Restarting..."
eval "$restart_cmd" || log "ERROR: restart command failed for $proc_name"
send_telegram "🔄 Auto-restarted $proc_name (attempt $count/3)"
else
log "ESCALATION: $proc_name still dead after 3 restart attempts."
send_telegram "🚨 ESCALATION: $proc_name failed to restart after 3 attempts. Manual intervention required."
fi
done
touch "${STATE_DIR}/auto_restart.last"