- scripts/auto_restart_agent.sh — monitor and restart dead processes (3-attempt backoff) - scripts/backup_pipeline.sh — daily backups with retention + offsite rsync hook - scripts/telegram_thread_reporter.py — route messages to ops/burn/main threads - infrastructure/cron/*.crontab — scheduling for new automations
This commit is contained in:
63
scripts/auto_restart_agent.sh
Normal file
63
scripts/auto_restart_agent.sh
Normal file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env bash
|
||||
# auto_restart_agent.sh — Auto-restart dead critical processes (FLEET-007)
|
||||
# Refs: timmy-home #560
|
||||
set -euo pipefail
|
||||
|
||||
LOG_DIR="/var/log/timmy"
|
||||
ALERT_LOG="${LOG_DIR}/auto_restart.log"
|
||||
STATE_DIR="/var/lib/timmy/restarts"
|
||||
mkdir -p "$LOG_DIR" "$STATE_DIR"
|
||||
|
||||
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
|
||||
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"
|
||||
|
||||
log() { echo "[$(date -Iseconds)] $1" | tee -a "$ALERT_LOG"; }
|
||||
|
||||
send_telegram() {
|
||||
local msg="$1"
|
||||
if [[ -n "$TELEGRAM_BOT_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
|
||||
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
|
||||
-d "chat_id=${TELEGRAM_CHAT_ID}" -d "text=${msg}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
}
|
||||
|
||||
# Format: "process_name:command_to_restart"
|
||||
# Override via AUTO_RESTART_PROCESSES env var
|
||||
DEFAULT_PROCESSES="act_runner:cd /opt/gitea-runner && nohup ./act_runner daemon >/var/log/gitea-runner.log 2>&1 &"
|
||||
PROCESSES="${AUTO_RESTART_PROCESSES:-$DEFAULT_PROCESSES}"
|
||||
|
||||
IFS=',' read -ra PROC_LIST <<< "$PROCESSES"
|
||||
|
||||
for entry in "${PROC_LIST[@]}"; do
|
||||
proc_name="${entry%%:*}"
|
||||
restart_cmd="${entry#*:}"
|
||||
proc_name=$(echo "$proc_name" | xargs)
|
||||
restart_cmd=$(echo "$restart_cmd" | xargs)
|
||||
|
||||
state_file="${STATE_DIR}/${proc_name}.count"
|
||||
count=$(cat "$state_file" 2>/dev/null || echo 0)
|
||||
|
||||
if pgrep -f "$proc_name" >/dev/null 2>&1; then
|
||||
# Process alive — reset counter
|
||||
if [[ "$count" -ne 0 ]]; then
|
||||
echo 0 > "$state_file"
|
||||
log "$proc_name is healthy — reset restart counter"
|
||||
fi
|
||||
continue
|
||||
fi
|
||||
|
||||
# Process dead
|
||||
count=$((count + 1))
|
||||
echo "$count" > "$state_file"
|
||||
|
||||
if [[ "$count" -le 3 ]]; then
|
||||
log "CRITICAL: $proc_name is dead (attempt $count/3). Restarting..."
|
||||
eval "$restart_cmd" || log "ERROR: restart command failed for $proc_name"
|
||||
send_telegram "🔄 Auto-restarted $proc_name (attempt $count/3)"
|
||||
else
|
||||
log "ESCALATION: $proc_name still dead after 3 restart attempts."
|
||||
send_telegram "🚨 ESCALATION: $proc_name failed to restart after 3 attempts. Manual intervention required."
|
||||
fi
|
||||
done
|
||||
|
||||
touch "${STATE_DIR}/auto_restart.last"
|
||||
80
scripts/backup_pipeline.sh
Normal file
80
scripts/backup_pipeline.sh
Normal file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env bash
|
||||
# backup_pipeline.sh — Daily fleet backup pipeline (FLEET-008)
|
||||
# Refs: timmy-home #561
|
||||
set -euo pipefail
|
||||
|
||||
BACKUP_ROOT="/backups/timmy"
|
||||
DATESTAMP=$(date +%Y%m%d-%H%M%S)
|
||||
BACKUP_DIR="${BACKUP_ROOT}/${DATESTAMP}"
|
||||
LOG_DIR="/var/log/timmy"
|
||||
ALERT_LOG="${LOG_DIR}/backup_pipeline.log"
|
||||
mkdir -p "$BACKUP_DIR" "$LOG_DIR"
|
||||
|
||||
TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN:-}"
|
||||
TELEGRAM_CHAT_ID="${TELEGRAM_CHAT_ID:-}"
|
||||
OFFSITE_TARGET="${OFFSITE_TARGET:-}"
|
||||
|
||||
log() { echo "[$(date -Iseconds)] $1" | tee -a "$ALERT_LOG"; }
|
||||
|
||||
send_telegram() {
|
||||
local msg="$1"
|
||||
if [[ -n "$TELEGRAM_BOT_TOKEN" && -n "$TELEGRAM_CHAT_ID" ]]; then
|
||||
curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
|
||||
-d "chat_id=${TELEGRAM_CHAT_ID}" -d "text=${msg}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
}
|
||||
|
||||
status=0
|
||||
|
||||
# --- Gitea repositories ---
|
||||
if [[ -d /root/gitea ]]; then
|
||||
tar czf "${BACKUP_DIR}/gitea-repos.tar.gz" -C /root gitea 2>/dev/null || true
|
||||
log "Backed up Gitea repos"
|
||||
fi
|
||||
|
||||
# --- Agent configs and state ---
|
||||
for wiz in bezalel allegro ezra timmy; do
|
||||
if [[ -d "/root/wizards/${wiz}" ]]; then
|
||||
tar czf "${BACKUP_DIR}/${wiz}-home.tar.gz" -C /root/wizards "${wiz}" 2>/dev/null || true
|
||||
log "Backed up ${wiz} home"
|
||||
fi
|
||||
done
|
||||
|
||||
# --- System configs ---
|
||||
cp /etc/crontab "${BACKUP_DIR}/crontab" 2>/dev/null || true
|
||||
cp -r /etc/systemd/system "${BACKUP_DIR}/systemd" 2>/dev/null || true
|
||||
log "Backed up system configs"
|
||||
|
||||
# --- Evennia worlds (if present) ---
|
||||
if [[ -d /root/evennia ]]; then
|
||||
tar czf "${BACKUP_DIR}/evennia-worlds.tar.gz" -C /root evennia 2>/dev/null || true
|
||||
log "Backed up Evennia worlds"
|
||||
fi
|
||||
|
||||
# --- Manifest ---
|
||||
find "$BACKUP_DIR" -type f > "${BACKUP_DIR}/manifest.txt"
|
||||
log "Backup manifest written"
|
||||
|
||||
# --- Offsite sync ---
|
||||
if [[ -n "$OFFSITE_TARGET" ]]; then
|
||||
if rsync -az --delete "${BACKUP_DIR}/" "${OFFSITE_TARGET}/${DATESTAMP}/" 2>/dev/null; then
|
||||
log "Offsite sync completed"
|
||||
else
|
||||
log "WARNING: Offsite sync failed"
|
||||
status=1
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- Retention: keep last 7 days ---
|
||||
find "$BACKUP_ROOT" -mindepth 1 -maxdepth 1 -type d -mtime +7 -exec rm -rf {} + 2>/dev/null || true
|
||||
log "Retention applied (7 days)"
|
||||
|
||||
if [[ "$status" -eq 0 ]]; then
|
||||
log "Backup pipeline completed: ${BACKUP_DIR}"
|
||||
send_telegram "✅ Daily backup completed: ${DATESTAMP}"
|
||||
else
|
||||
log "Backup pipeline completed with WARNINGS: ${BACKUP_DIR}"
|
||||
send_telegram "⚠️ Daily backup completed with warnings: ${DATESTAMP}"
|
||||
fi
|
||||
|
||||
exit "$status"
|
||||
59
scripts/telegram_thread_reporter.py
Normal file
59
scripts/telegram_thread_reporter.py
Normal file
@@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
telegram_thread_reporter.py — Route reports to Telegram threads (#895)
|
||||
Usage:
|
||||
python telegram_thread_reporter.py --topic ops --message "Heartbeat OK"
|
||||
python telegram_thread_reporter.py --topic burn --message "Burn cycle done"
|
||||
python telegram_thread_reporter.py --topic main --message "Escalation!"
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import json
|
||||
|
||||
DEFAULT_THREADS = {
|
||||
"ops": os.environ.get("TELEGRAM_OPS_THREAD_ID"),
|
||||
"burn": os.environ.get("TELEGRAM_BURN_THREAD_ID"),
|
||||
"main": None, # main channel = no thread id
|
||||
}
|
||||
|
||||
|
||||
def send_message(bot_token: str, chat_id: str, text: str, thread_id: str | None = None):
|
||||
url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
|
||||
data = {"chat_id": chat_id, "text": text, "parse_mode": "HTML"}
|
||||
if thread_id:
|
||||
data["message_thread_id"] = thread_id
|
||||
payload = urllib.parse.urlencode(data).encode("utf-8")
|
||||
req = urllib.request.Request(url, data=payload, headers={"Content-Type": "application/x-www-form-urlencoded"})
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||||
return json.loads(resp.read().decode("utf-8"))
|
||||
except Exception as e:
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Telegram thread reporter")
|
||||
parser.add_argument("--topic", required=True, choices=["ops", "burn", "main"])
|
||||
parser.add_argument("--message", required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
bot_token = os.environ.get("TELEGRAM_BOT_TOKEN")
|
||||
chat_id = os.environ.get("TELEGRAM_CHAT_ID")
|
||||
if not bot_token or not chat_id:
|
||||
print("Missing TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
thread_id = DEFAULT_THREADS.get(args.topic)
|
||||
result = send_message(bot_token, chat_id, args.message, thread_id)
|
||||
if result.get("ok"):
|
||||
print(f"Sent to {args.topic}")
|
||||
else:
|
||||
print(f"Failed: {result}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user