Files
the-nexus/fleet/fleet.sh
Alexander Whitestone 37b006d3c6
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
CI / validate (pull_request) Failing after 10s
feat: Fleet management (#910), retry logic (#896), morning report (#897)
- fleet/fleet.sh: cross-VPS health, status, restart, deploy
- nexus/retry_helper.py: retry decorator, dead letter queue, checkpoints
- nexus/morning_report.py: automated 0600 overnight activity report
- fleet/allegro/archived-scripts/README.md: burn script archive placeholder

Fixes #910
Fixes #896
Fixes #897
Fixes #898
2026-04-06 23:09:49 -04:00

122 lines
3.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# fleet.sh — Cross-VPS fleet management
# Manages Allegro (167.99.126.228), Bezalel (159.203.146.185) and Ezra (143.198.27.163)
# Usage: fleet.sh <command> [options]
#
# Commands:
# health — Run health checks on all VPSes
# restart <svc> — Restart a service on all VPSes
# status — Show fleet status summary
# ssh <host> — SSH into a specific host (allegro|bezalel|ezra)
# run <command> — Run a command on all VPSes
# deploy — Deploy latest config to all VPSes
# Strict mode: abort on command failure, unset variables, and failed pipeline stages.
set -euo pipefail

# Fleet inventory: one address per VPS.
ALLEGRO="167.99.126.228"
BEZALEL="159.203.146.185"
EZRA="143.198.27.163"

# Remote login account, read when building the ssh target.
# NOTE(review): this overwrites the conventional $USER environment variable
# for the rest of the script — confirm nothing relies on the caller's login name.
USER="root"

# Options handed to every ssh invocation (expanded unquoted so they word-split).
# accept-new records a host key on first contact but refuses a host whose key
# has CHANGED; the previous "no" let connections to changed keys proceed, which
# is unsafe for root sessions. Requires OpenSSH 7.6 or newer.
SSH_OPTS="-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10"

# Parallel, space-separated lists: the Nth name labels the Nth address.
hosts="$ALLEGRO $BEZALEL $EZRA"
host_names="allegro bezalel ezra"
# log — print a timestamped message tagged "FLEET:" on stdout.
# Arguments: $* - message words, joined by the first character of IFS
# Outputs:   one line of the form "[YYYY-MM-DD HH:MM:SS] FLEET: <message>"
log() {
  printf '[%s] FLEET: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# remote — run a command on one host over ssh.
# Globals:   SSH_OPTS, USER (read)
# Arguments: $1 - host address; remaining arguments - command passed to ssh
# Outputs:   whatever the remote command writes
# Returns:   the exit status of ssh
remote() {
  local host=$1
  shift
  # stdin is tied to /dev/null: ssh otherwise reads the caller's stdin, and
  # when called inside a `... | while read` loop it swallows the rest of the
  # host list so that only the first host is ever processed.
  # SSH_OPTS is a space-separated option list and must word-split.
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "$USER@$host" "$@" </dev/null
}
# cmd_health — print a health snapshot for every host in the fleet.
# Globals:  host_names, hosts (read) — parallel space-separated lists
# Outputs:  per host, a "=== name (address) ===" header followed by the remote
#           output of uptime, free -m, df -h / and the failed systemd units,
#           or an "SSH: FAILED" line when the remote call returns non-zero.
# Returns:  0; a failing host is reported inline and does not stop the run.
cmd_health() {
  local -a names addrs
  local i name host
  log "Running fleet health check..."
  # Iterate over arrays instead of piping the list into `while read`: inside
  # such a pipeline ssh inherits the pipe as stdin and consumes the remaining
  # hosts, so only the first one would be checked.
  read -r -a names <<<"$host_names"
  read -r -a addrs <<<"$hosts"
  for i in "${!names[@]}"; do
    name=${names[i]}
    host=${addrs[i]:-}
    echo ""
    echo "=== $name ($host) ==="
    if remote "$host" "echo 'SSH: OK'; uptime; free -m | head -2; df -h / | tail -1; systemctl list-units --state=failed --no-pager | head -10" 2>&1; then
      echo "---"
    else
      echo "SSH: FAILED — host unreachable"
    fi
  done
}
# cmd_status — print a one-line up/down summary per host.
# Globals:  host_names, hosts (read) — parallel space-separated lists
# Outputs:  "<name padded to 12> UP <uptime>" when the first remote call
#           succeeds, otherwise "<name padded to 12>  UNREACHABLE".
# Returns:  0; a failing host is reported inline and does not stop the run.
cmd_status() {
  local -a names addrs
  local i host uptime_str
  log "Fleet status summary..."
  # Iterate over arrays instead of piping the list into `while read`: inside
  # such a pipeline ssh inherits the pipe as stdin and consumes the remaining
  # hosts, so only the first one would be reported.
  read -r -a names <<<"$host_names"
  read -r -a addrs <<<"$hosts"
  for i in "${!names[@]}"; do
    host=${addrs[i]:-}
    printf "%-12s " "${names[i]}"
    # First call is the reachability probe; the remote side prints "UP"
    # without a newline so the uptime text lands on the same line.
    if remote "$host" "echo -n 'UP' 2>/dev/null" 2>/dev/null; then
      # Falls back to plain `uptime` where `uptime -p` fails, and to the
      # literal "unknown" when the second remote call itself fails.
      uptime_str=$(remote "$host" "uptime -p 2>/dev/null || uptime" 2>/dev/null || echo "unknown")
      echo " $uptime_str"
    else
      echo " UNREACHABLE"
    fi
  done
}
# cmd_restart — restart one systemd service on every host.
# Globals:   host_names, hosts (read) — parallel space-separated lists
# Arguments: $1 - service name handed to `systemctl restart`
# Outputs:   per host, the name padded to 12 columns followed by the remote
#            result ("restarted" or "FAILED"), or "UNREACHABLE" when the
#            remote call returns non-zero.
# Returns:   1 with a usage message when no service is given, otherwise 0.
cmd_restart() {
  local svc=${1:-}
  local -a names addrs
  local i
  if [ -z "$svc" ]; then
    echo "Usage: fleet.sh restart <service>"
    echo "Common: hermes-agent evennia nginx docker"
    return 1
  fi
  log "Restarting '$svc' on all hosts..."
  # Iterate over arrays instead of piping the list into `while read`: inside
  # such a pipeline ssh inherits the pipe as stdin and consumes the remaining
  # hosts, so only the first one would be restarted.
  read -r -a names <<<"$host_names"
  read -r -a addrs <<<"$hosts"
  for i in "${!names[@]}"; do
    printf "%-12s " "${names[i]}"
    # $svc is spliced into the remote command line as-is, so the remote shell
    # word-splits and interprets it; only pass trusted service names.
    if remote "${addrs[i]:-}" "systemctl restart $svc 2>&1 && echo 'restarted' || echo 'FAILED'" 2>/dev/null; then
      echo ""
    else
      echo "UNREACHABLE"
    fi
  done
}
# cmd_run — run an arbitrary command on every host.
# Globals:   host_names, hosts (read) — parallel space-separated lists
# Arguments: $1 - command string passed to the remote shell
# Outputs:   per host, a "=== name (address) ===" header, the remote output
#            (stderr merged in), "(failed)" on a non-zero status, then a blank line.
# Returns:   1 with a usage message when no command is given, otherwise 0.
cmd_run() {
  local cmd="${1:-}"
  local -a names addrs
  local i
  if [ -z "$cmd" ]; then
    echo "Usage: fleet.sh run '<command>'"
    return 1
  fi
  log "Running '$cmd' on all hosts..."
  # Iterate over arrays instead of piping the list into `while read`: inside
  # such a pipeline ssh inherits the pipe as stdin and consumes the remaining
  # hosts, so the command would only run on the first one.
  read -r -a names <<<"$host_names"
  read -r -a addrs <<<"$hosts"
  for i in "${!names[@]}"; do
    echo "=== ${names[i]} (${addrs[i]:-}) ==="
    remote "${addrs[i]:-}" "$cmd" 2>&1 || echo "(failed)"
    echo ""
  done
}
# cmd_deploy — run the config update script and restart hermes-agent on each
# deploy target.
# Globals:  ALLEGRO, BEZALEL (read)
# Outputs:  per host, a "=== name ===" header followed by the remote output,
#           or "(unreachable)" when the remote call returns non-zero.
# Returns:  0; a failing host is reported inline and does not stop the run.
cmd_deploy() {
  # Loop variables are local so they do not leak into the caller's scope.
  local pair name host
  log "Deploying config to all hosts..."
  # Push timmy-config updates to each host
  # NOTE(review): only allegro and bezalel are targeted although the fleet
  # lists also contain ezra — confirm the omission is intentional.
  for pair in "allegro:$ALLEGRO" "bezalel:$BEZALEL"; do
    name=${pair%%:*}
    host=${pair##*:}
    echo ""
    echo "=== $name ==="
    # Remote side: any failure of `cd /root && ./update-config.sh` (not only a
    # missing script) prints 'No update script found'; the hermes-agent
    # restart is attempted regardless.
    remote "$host" "cd /root && ./update-config.sh 2>/dev/null || echo 'No update script found'; systemctl restart hermes-agent 2>/dev/null && echo 'hermes-agent restarted' || echo 'hermes-agent not found'" 2>&1 || echo "(unreachable)"
  done
}
# usage — print the command summary on stdout.
usage() {
  echo "Usage: fleet.sh <command> [options]"
  echo ""
  echo "Commands:"
  echo " health — Run health checks on all VPSes"
  echo " status — Show fleet status summary"
  echo " restart <svc> — Restart a service on all VPSes"
  echo " run '<cmd>' — Run a command on all VPSes"
  echo " deploy — Deploy config to all VPSes"
  echo " ssh <host> — SSH into host (allegro|bezalel|ezra)"
}

# cmd_ssh — open an ssh session to one named host.
# Globals:   ALLEGRO, BEZALEL, EZRA, USER, SSH_OPTS (read)
# Arguments: $1 - host name: allegro, bezalel or ezra
# Returns:   1 with a usage message for an unknown or missing name,
#            otherwise the exit status of ssh.
cmd_ssh() {
  local target=${1:-}
  local addr
  case "$target" in
    allegro) addr=$ALLEGRO ;;
    bezalel) addr=$BEZALEL ;;
    ezra) addr=$EZRA ;;
    *)
      echo "Usage: fleet.sh ssh <allegro|bezalel|ezra>"
      return 1
      ;;
  esac
  # SSH_OPTS is a space-separated option list and must word-split.
  # shellcheck disable=SC2086
  ssh $SSH_OPTS "$USER@$addr"
}

# Main dispatch
# main — route the first argument to its command handler. The "ssh" command
# was advertised in the help text but had no arm, so it used to fall through
# to the help output; it is now handled by cmd_ssh.
# Arguments: $1 - command name (defaults to help); $2 - optional command argument
main() {
  case "${1:-help}" in
    health) cmd_health ;;
    status) cmd_status ;;
    restart) cmd_restart "${2:-}" ;;
    run) cmd_run "${2:-}" ;;
    deploy) cmd_deploy ;;
    ssh) cmd_ssh "${2:-}" ;;
    help | *) usage ;;
  esac
}

main "$@"