Merge branch 'feature/resilience'

This commit is contained in:
Alexander Whitestone
2026-04-05 17:25:26 -04:00
2 changed files with 268 additions and 0 deletions

177
resilience/health-check.sh Executable file
View File

@@ -0,0 +1,177 @@
#!/usr/bin/env bash
# health-check.sh — Health check and service monitoring for the-door
# Usage: bash health-check.sh [--auto-restart] [--verbose]
#
# Checks:
# 1. nginx process is running
# 2. Static files are accessible (index.html serves correctly)
# 3. Gateway endpoint responds (if configured)
# 4. Disk space is adequate (< 90% used)
# 5. SSL cert is valid and not expiring soon
# Strict mode: abort on command failure, on unset variables, and on any
# failing stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Command-line switches (0 = off, 1 = on).
VERBOSE=0
AUTO_RESTART=0

# Result counters; updated by the ok()/warn() helpers and read by the summary.
HEALTHY=0
WARNINGS=0

# Walk the positional parameters; any unrecognised argument prints the usage
# line and terminates with status 1.
for arg; do
  if [ "$arg" = "--verbose" ]; then
    VERBOSE=1
  elif [ "$arg" = "--auto-restart" ]; then
    AUTO_RESTART=1
  else
    echo "Usage: $0 [--auto-restart] [--verbose]"
    exit 1
  fi
done
# log MESSAGE
# Print MESSAGE on stdout, prefixed with a local "[YYYY-MM-DD HH:MM:SS]" stamp.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"; }

# info MESSAGE
# Informational line; does not touch any counter.
info() { log "INFO $1"; }

# warn MESSAGE [ACTION]
# Record a non-fatal finding: logs MESSAGE, prints the suggested ACTION when
# one is given, and increments WARNINGS.
warn() {
  log "WARN $1"
  # ACTION is optional so a one-argument call cannot trip `set -u`.
  if [ -n "${2:-}" ]; then
    echo " ACTION: $2"
  fi
  WARNINGS=$((WARNINGS + 1))
}

# ok MESSAGE
# Record a passing check: logs MESSAGE and increments HEALTHY.
ok() { log "OK $1"; HEALTHY=$((HEALTHY + 1)); }

# fail MESSAGE [ACTION] [RECOVERY_CMD]
# Record a failed check: logs MESSAGE, prints the suggested ACTION when given,
# and increments WARNINGS so the script's exit status reflects the failure.
# When AUTO_RESTART is 1 and RECOVERY_CMD is given, RECOVERY_CMD is executed
# as a command (a function name or executable, without arguments).
fail() {
  log "FAIL $1"
  if [ -n "${2:-}" ]; then
    echo " ACTION: $2"
  fi
  # Failures must count, otherwise a run with only hard failures exits 0.
  WARNINGS=$((WARNINGS + 1))
  if [ "$AUTO_RESTART" = 1 ] && [ -n "${3:-}" ]; then
    # A failing recovery must not abort the remaining checks under `set -e`.
    "$3" || log "WARN recovery action '$3' failed"
  fi
}
# ── Check 1: nginx ─────────────────────────────────
# check_nginx [HOST] [PORT]
# Report whether a process named exactly "nginx" exists.
# HOST and PORT are accepted so the call site matches the other checks, but
# they are not used: this check only looks at the process table.
# On failure, "restart_nginx" is handed to fail as the recovery command.
check_nginx() {
  local pids first_pid
  # Query pgrep once so the PID that is reported comes from the same snapshot
  # that decided the process is alive.
  if pids=$(pgrep -x nginx 2>/dev/null) && [ -n "$pids" ]; then
    # pgrep prints one PID per line; keep only the first line.
    first_pid=${pids%%$'\n'*}
    ok "nginx is running (PID: $first_pid)"
  else
    fail "nginx is NOT running" "Start nginx: systemctl start nginx || nginx" "restart_nginx"
  fi
}
# ── Check 2: static files ──────────────────────────
# check_static [HOST] [PORT]
# Request /index.html from HOST and classify the HTTP status:
#   200   -> ok
#   000   -> fail (no HTTP response at all)
#   other -> warn
# PORT (default 80) is appended to the URL only when it is not 80 or 443, so
# the default invocation requests the same URL as before.
check_static() {
  local host="${1:-localhost}"
  local port="${2:-80}"
  local protocol="http"

  # Switch to HTTPS when a certificate directory exists on this machine.
  # NOTE(review): /etc/ssl exists on most Linux installs, so this probably
  # selects https almost everywhere — confirm that is the intent.
  if [ -d "/etc/letsencrypt" ] || [ -d "/etc/ssl" ]; then
    protocol="https"
  fi

  local url="$protocol://$host"
  case "$port" in
    80 | 443) ;;
    *) url="$url:$port" ;;
  esac

  local status
  # With -w '%{http_code}' curl itself prints 000 when it gets no response.
  # Appending another "000" on failure would produce "000000" and the
  # unreachable branch below would never match, so only default an empty value.
  status=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 -k "$url/index.html" 2>/dev/null) || true
  status=${status:-000}

  if [ "$status" = "200" ]; then
    ok "index.html serves OK (HTTP $status)"
  elif [ "$status" = "000" ]; then
    # fail runs its third argument as a command in auto-restart mode. This
    # check has no automated recovery, so pass the no-op `true`.
    fail "Cannot reach $url" "Check nginx config: nginx -t" "true"
  else
    warn "Unexpected status for index.html: HTTP $status" "Check nginx config and file permissions"
  fi
}
# ── Check 3: Gateway ───────────────────────────────
# check_gateway [GATEWAY_URL]
# Request GATEWAY_URL/health (default base: http://localhost:8000).
#   200   -> ok
#   000   -> warn "not reachable" (no HTTP response)
#   other -> warn with the returned status
# Gateway problems are reported as warnings only; no recovery is attempted.
check_gateway() {
  local gateway_url="${1:-http://localhost:8000}"
  local status

  # curl prints 000 by itself when the connection fails. Only substitute a
  # default when it printed nothing, so the value never becomes "000000".
  status=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 "$gateway_url/health" 2>/dev/null) || true
  status=${status:-000}

  case "$status" in
    200)
      ok "Gateway responds (HTTP $status)"
      ;;
    000)
      warn "Gateway not reachable at $gateway_url" "Check gateway service: systemctl status gateway || docker ps"
      ;;
    *)
      warn "Gateway returned HTTP $status" "Check gateway logs"
      ;;
  esac
}
# ── Check 4: Disk space ────────────────────────────
# check_disk
# Classify the used-space percentage of the root filesystem:
#   below 80 -> ok
#   80..89   -> warn
#   90+      -> fail, with "cleanup_disk" as the recovery command
# A value that cannot be read as an integer is reported as a warning.
check_disk() {
  local usage

  # -P forces the POSIX one-line-per-filesystem layout, so a long device name
  # cannot wrap the record and shift the columns.
  usage=$(df -P / 2>/dev/null | awk 'NR == 2 { sub(/%$/, "", $5); print $5 }') || usage=""

  # Guard the numeric comparisons below against empty or non-numeric input.
  case "$usage" in
    '' | *[!0-9]*)
      warn "Cannot determine disk usage" "Check manually: df -h /"
      return 0
      ;;
  esac

  if [ "$usage" -lt 80 ]; then
    ok "Disk usage: ${usage}%"
  elif [ "$usage" -lt 90 ]; then
    warn "Disk usage: ${usage}%" "Clean up logs and temp files: journalctl --vacuum-size=100M"
  else
    fail "Disk usage CRITICAL: ${usage}%" "Emergency cleanup needed" "cleanup_disk"
  fi
}
# ── Check 5: SSL cert ──────────────────────────────
# check_ssl [DOMAIN] [CERT_ROOT]
# Inspect CERT_ROOT/DOMAIN/fullchain.pem and report on its expiry date.
#   DOMAIN     defaults to "localhost"
#   CERT_ROOT  defaults to /etc/letsencrypt/live
# Outcomes:
#   no directory for DOMAIN  -> warn only when VERBOSE is 1, otherwise silent
#   directory without a cert -> silent
#   expiry unreadable        -> warn
#   expiry already passed    -> fail
#   more than 30 days left   -> ok
#   otherwise                -> warn
# Always returns 0; the result is communicated through ok/warn/fail.
check_ssl() {
  local domain="${1:-localhost}"
  local cert_root="${2:-/etc/letsencrypt/live}"
  local cert_dir="$cert_root/$domain"
  local cert_file="$cert_dir/fullchain.pem"

  if [ ! -d "$cert_dir" ]; then
    if [ "$VERBOSE" = 1 ]; then
      warn "No Let's Encrypt cert at $cert_dir" "Assuming self-signed or no SSL"
    fi
    return 0
  fi
  if [ ! -f "$cert_file" ]; then
    return 0
  fi

  # openssl prints "notAfter=<date>"; keep what follows the first "=".
  local expiry
  expiry=$(openssl x509 -enddate -noout -in "$cert_file" 2>/dev/null) || expiry=""
  expiry=${expiry#*=}
  if [ -z "$expiry" ]; then
    warn "Cannot read SSL cert expiry" "Check cert: openssl x509 -enddate -noout -in $cert_file"
    return 0
  fi

  # Try the GNU date syntax first, then the BSD one.
  local expiry_epoch
  expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null) \
    || expiry_epoch=$(date -j -f "%b %d %T %Y %Z" "$expiry" +%s 2>/dev/null) \
    || expiry_epoch=""

  # An unparseable date must not be mistaken for an expired certificate.
  case "$expiry_epoch" in
    '' | *[!0-9]*)
      warn "Cannot parse SSL cert expiry date: $expiry" "Check cert: openssl x509 -enddate -noout -in $cert_file"
      return 0
      ;;
  esac

  local now_epoch
  now_epoch=$(date +%s)
  local days_left=$(( (expiry_epoch - now_epoch) / 86400 ))

  # Compare timestamps rather than whole days, so a certificate with less than
  # 24 hours left is still reported as expiring, not as expired.
  if [ "$expiry_epoch" -le "$now_epoch" ]; then
    # fail runs its third argument as a command in auto-restart mode. Renewal
    # is not automated here, so pass the no-op `true`.
    fail "SSL cert has EXPIRED" "Renew immediately: certbot renew --force-renewal" "true"
  elif [ "$days_left" -gt 30 ]; then
    ok "SSL cert expires in ${days_left} days ($expiry)"
  else
    warn "SSL cert expires in ${days_left} days!" "Renew: certbot renew"
  fi
}
# ── Recovery functions ──────────────────────────────
# restart_nginx
# Recovery action for a missing nginx process.
# Uses `systemctl restart nginx` when systemctl is available; otherwise tries
# `nginx -s reload` and, if that fails, starts nginx directly.
# Each outcome is reported through info/warn; the function itself returns 0.
restart_nginx() {
  info "Attempting to restart nginx..."

  # Explicit if/else instead of `a && b || c`, so the failure branch can only
  # run when the restart command itself failed.
  if command -v systemctl > /dev/null 2>&1; then
    if systemctl restart nginx; then
      info "nginx restarted successfully"
    else
      warn "nginx restart failed" "Manual intervention needed"
    fi
  elif command -v nginx > /dev/null 2>&1; then
    if nginx -s reload 2> /dev/null; then
      info "nginx reloaded"
    elif nginx; then
      info "nginx started"
    else
      warn "nginx start failed" "Manual intervention needed"
    fi
  else
    warn "Cannot restart nginx: neither systemctl nor nginx found" "Manual intervention needed"
  fi
}
# cleanup_disk
# Best-effort emergency cleanup used as the recovery action for a full disk.
# Every step ignores errors on purpose: a tool may be missing or a path may be
# unreadable, and the remaining steps should still run.
cleanup_disk() {
  info "Running disk cleanup..."

  # Shrink the systemd journal; ignored when journalctl is unavailable.
  journalctl --vacuum-size=100M 2> /dev/null || true

  # Remove only /tmp entries untouched for at least 24 hours (1440 minutes).
  # Deleting everything under /tmp would also take sockets, lock files and
  # in-flight temp files that belong to running services.
  find /tmp -xdev -mindepth 1 -mmin +1440 -delete 2> /dev/null || true

  # Remove compressed files directly inside /var/log.
  find /var/log -maxdepth 1 -type f -name '*.gz' -delete 2> /dev/null || true

  info "Cleanup complete"
}
# ── Main ────────────────────────────────────────────
# Resolve the target once; HEALTH_HOST and HEALTH_PORT are optional overrides.
health_host="${HEALTH_HOST:-localhost}"
health_port="${HEALTH_PORT:-80}"

info "=== The Door Health Check ==="
info "Host: $health_host"
info "Time: $(date)"
echo ""

# Run every check in a fixed order; each one records its result through the
# ok/warn/fail helpers.
check_nginx "$health_host" "$health_port"
check_static "$health_host" "$health_port"
check_gateway "${GATEWAY_URL:-http://localhost:8000}"
check_disk
check_ssl "$health_host"
echo ""

if [ "$WARNINGS" -gt 0 ] || [ "$HEALTHY" -gt 0 ]; then
  info "Summary: $HEALTHY OK, $WARNINGS warnings/failures"
fi

# Exit 1 whenever anything was recorded in WARNINGS, 0 otherwise.
if [ "$WARNINGS" -gt 0 ]; then
  if [ "$AUTO_RESTART" = 1 ]; then
    # info is used rather than warn: this is a status note, not a new finding,
    # and it must not change the counters after the summary has been printed.
    info "Auto-restart mode is ON — recovery actions attempted"
  fi
  exit 1
fi
exit 0

91
resilience/service-restart.sh Executable file
View File

@@ -0,0 +1,91 @@
#!/usr/bin/env bash
# service-restart.sh — Graceful service restart for the-door
# Usage: bash service-restart.sh [--force]
#
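# Performs an ordered restart: stop -> start -> verify started, where the
# verification is an nginx process check plus an HTTP request for static content.
# NOTE: there is currently no "verify stopped" step; stop_services only pauses
# for one second before reporting that services are stopped.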
# Strict mode: abort on command failure, on unset variables, and on any
# failing stage of a pipeline.
set -o errexit -o nounset -o pipefail

# 1 when --force was given, 0 otherwise.
FORCE=0

# --force is the only accepted argument; anything else prints the usage line
# and terminates with status 1.
for arg; do
  if [ "$arg" = "--force" ]; then
    FORCE=1
  else
    echo "Usage: $0 [--force]"
    exit 1
  fi
done
# log MESSAGE
# Write MESSAGE to stdout behind a local "[YYYY-MM-DD HH:MM:SS]" timestamp.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
# ── Stop ────────────────────────────────────────────
# stop_services
# Stop nginx (through systemctl when available, otherwise `nginx -s stop`),
# then send SIGTERM to every process listening on TCP port 8000 (the gateway).
# All steps are best effort: a service that is not running is not an error.
# NOTE(review): start_services only starts nginx, so a gateway stopped here is
# not brought back by this script — confirm something else supervises it.
stop_services() {
  log "Stopping services..."

  if command -v systemctl > /dev/null 2>&1; then
    if systemctl stop nginx 2> /dev/null; then
      log "nginx stopped"
    fi
  elif command -v nginx > /dev/null 2>&1; then
    if nginx -s stop 2> /dev/null; then
      log "nginx stopped"
    fi
  fi

  # -sTCP:LISTEN restricts the match to the listening process; without it,
  # lsof -i:8000 also reports clients that merely hold a connection to 8000.
  local gw_pids pid
  gw_pids=$(lsof -ti:8000 -sTCP:LISTEN 2> /dev/null || true)

  # lsof prints one PID per line. Signal each one separately, because a single
  # quoted multi-line string is not a valid argument to kill.
  while IFS= read -r pid; do
    [ -n "$pid" ] || continue
    if kill "$pid" 2> /dev/null; then
      log "Gateway stopped (PID $pid)"
    fi
  done <<< "$gw_pids"

  sleep 1
  log "All services stopped"
}
# ── Start ───────────────────────────────────────────
# start_services
# Bring nginx up, preferring systemctl and falling back to the nginx binary.
# Returns 1 (after logging the failure) when the start command fails, and 0
# otherwise — including the case where neither tool is installed.
start_services() {
  log "Starting services..."

  # nginx: try the service manager first, then the bare binary.
  if command -v systemctl > /dev/null 2>&1; then
    if ! systemctl start nginx; then
      log "FAILED to start nginx"
      return 1
    fi
    log "nginx started"
  elif command -v nginx > /dev/null 2>&1; then
    if ! nginx 2> /dev/null; then
      log "FAILED to start nginx"
      return 1
    fi
    log "nginx started"
  fi

  log "All services started"
}
# ── Verify ──────────────────────────────────────────
# verify_services [HOST]
# Confirm the restart worked:
#   1. a process named exactly "nginx" must exist, otherwise return 1;
#   2. http://HOST/ is requested and its status logged. A non-200 answer is
#      only logged as a warning and does not change the return status.
# HOST defaults to "localhost".
verify_services() {
  local host="${1:-localhost}"
  log "Verifying services..."

  # Check nginx
  if ! pgrep -x nginx > /dev/null 2>&1; then
    log "ERROR: nginx failed to start"
    return 1
  fi
  log "nginx is running"

  # Check static file
  local status
  # curl prints 000 by itself when it gets no response. Only default an empty
  # value, so the logged status never becomes "000000".
  status=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 "http://$host/" 2>/dev/null) || true
  status=${status:-000}

  if [ "$status" = "200" ]; then
    log "Static content verified (HTTP $status)"
  else
    log "WARNING: Static content check returned HTTP $status"
  fi
}
# ── Main ────────────────────────────────────────────
# Entry point: optional stop, then start, then verification. Strict mode ends
# the script at the first step that returns non-zero.
log "=== Service Restart ==="

# NOTE(review): with --force nothing is stopped at all, so start_services is
# asked to start an nginx that may still be running — confirm this is the
# intended meaning of "force".
case "$FORCE" in
  1) log "FORCE mode — skipping graceful stop" ;;
  *) stop_services ;;
esac

start_services
verify_services "${HEALTH_HOST:-localhost}"
log "=== Restart complete ==="