feat: Fallback + resilience — health checks, restart, failover (#8)
Adds operational resilience tooling: - resilience/health-check.sh: Health check script with 5 checks (nginx, static content, gateway, disk, SSL). Supports --auto-restart and --verbose modes. - resilience/service-restart.sh: Graceful ordered service restart with stop->verify->start->verify cycle. Supports --force mode. - Fallback logic for when gateway is unreachable (graceful degradation to static pages) All scripts are self-contained, no external dependencies, work on common Linux distros.
This commit is contained in:
177
resilience/health-check.sh
Executable file
177
resilience/health-check.sh
Executable file
@@ -0,0 +1,177 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# health-check.sh — Health check and service monitoring for the-door
|
||||||
|
# Usage: bash health-check.sh [--auto-restart] [--verbose]
|
||||||
|
#
|
||||||
|
# Checks:
|
||||||
|
# 1. nginx process is running
|
||||||
|
# 2. Static files are accessible (index.html serves correctly)
|
||||||
|
# 3. Gateway endpoint responds (if configured)
|
||||||
|
# 4. Disk space is adequate (warns at 80% used, fails at 90% used)
|
||||||
|
# 5. SSL cert is valid and not expiring soon
|
||||||
|
|
||||||
|
# Strict mode: abort on command errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Result counters, bumped by the ok()/warn() helpers defined below.
HEALTHY=0
WARNINGS=0

# Option flags (0 = off, 1 = on), switched on by the command-line flags below.
AUTO_RESTART=0
VERBOSE=0

# Walk the command-line arguments; anything unrecognised prints usage and exits 1.
for arg; do
  if [ "$arg" = "--verbose" ]; then
    VERBOSE=1
  elif [ "$arg" = "--auto-restart" ]; then
    AUTO_RESTART=1
  else
    echo "Usage: $0 [--auto-restart] [--verbose]"
    exit 1
  fi
done
|
||||||
|
|
||||||
|
# Print a message to stdout, prefixed with a [YYYY-MM-DD HH:MM:SS] timestamp.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
|
||||||
|
# Emit an INFO-level line through log().
# Arguments: $1 - message text
info() {
  local line="INFO $1"
  log "$line"
}
|
||||||
|
# Log a WARN-level line, print the suggested action (if any) and count it.
# Arguments: $1 - message text
#            $2 - optional remediation hint, printed as " ACTION: ..."
# Globals:   WARNINGS (incremented)
warn() {
  local message="$1"
  # Default to empty so a one-argument call does not abort under `set -u`.
  local action="${2:-}"

  log "WARN $message"
  if [ -n "$action" ]; then
    echo " ACTION: $action"
  fi
  WARNINGS=$((WARNINGS + 1))
}
|
||||||
|
# Emit an OK-level line through log() and count the passed check.
# Arguments: $1 - message text
# Globals:   HEALTHY (incremented)
ok() {
  local line="OK $1"
  log "$line"
  HEALTHY=$((1 + HEALTHY))
}
|
||||||
|
# Log a FAIL-level line, print the suggested action, count the failure and,
# in auto-restart mode, run the named recovery command.
# Arguments: $1 - message text
#            $2 - optional remediation hint, printed as " ACTION: ..."
#            $3 - optional name of a recovery function/command to invoke
# Globals:   AUTO_RESTART (read), WARNINGS (incremented)
fail() {
  local message="$1"
  local action="${2:-}"
  # Default to empty so callers without a recovery step survive `set -u`.
  local recovery="${3:-}"

  log "FAIL $message"
  if [ -n "$action" ]; then
    echo " ACTION: $action"
  fi
  # Failures share the "warnings/failures" counter that drives the exit code;
  # without this a run with only FAIL results would still exit 0.
  WARNINGS=$((WARNINGS + 1))

  if [ "$AUTO_RESTART" != 1 ] || [ -z "$recovery" ]; then
    return 0
  fi

  # Only invoke something the shell can actually resolve; a stray hint string
  # passed here must not be executed as a command.
  if command -v "$recovery" > /dev/null 2>&1; then
    "$recovery"
  else
    log "WARN No recovery handler named '$recovery'; skipping"
  fi
}
|
||||||
|
|
||||||
|
# ── Check 1: nginx ─────────────────────────────────
|
||||||
|
# Check 1: is an nginx process running?
# Reports via ok() with the first matching PID, or via fail() naming
# restart_nginx as the recovery step.
# Arguments: $1 - host (default localhost), $2 - port (default 80);
#            both are accepted but not used by this check.
check_nginx() {
  local host="${1:-localhost}"
  local port="${2:-80}"

  # Guard clause: no exact-name match means nginx is down.
  if ! pgrep -x nginx > /dev/null 2>&1; then
    fail "nginx is NOT running" "Start nginx: systemctl start nginx || nginx" "restart_nginx"
    return
  fi

  ok "nginx is running (PID: $(pgrep -x nginx | head -1))"
}
|
||||||
|
|
||||||
|
# ── Check 2: static files ──────────────────────────
|
||||||
|
# Check 2: does index.html get served?
# Requests <protocol>://<host>/index.html and reports:
#   HTTP 200   -> ok
#   no answer  -> fail (recovery: restart_nginx)
#   other code -> warn
# Arguments: $1 - host (default localhost)
#            $2 - port (default 80); accepted but not used in the URL
# NOTE(review): /etc/ssl exists on most Linux hosts, so this will almost
# always probe HTTPS; confirm that is intended for HTTP-only deployments.
check_static() {
  local host="${1:-localhost}"
  local port="${2:-80}"
  local protocol="http"

  # Switch to HTTPS when a certificate directory is present.
  if [ -d "/etc/letsencrypt" ] || [ -d "/etc/ssl" ]; then
    protocol="https"
  fi

  local status
  # curl itself prints "000" via -w when it cannot connect, so appending
  # another "000" on failure would produce "000000". Ignore the exit code
  # and only fall back to "000" when nothing was printed (curl missing).
  status=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 -k \
    "$protocol://$host/index.html" 2> /dev/null) || true
  status="${status:-000}"

  if [ "$status" = "200" ]; then
    ok "index.html serves OK (HTTP $status)"
  elif [ "$status" = "000" ]; then
    # Argument order is message, action hint, recovery function.
    fail "Cannot reach $protocol://$host:" "Check nginx config: nginx -t" "restart_nginx"
  else
    warn "Unexpected status for index.html: HTTP $status" "Check nginx config and file permissions"
  fi
}
|
||||||
|
|
||||||
|
# ── Check 3: Gateway ───────────────────────────────
|
||||||
|
# Check 3: does the gateway answer on its /health endpoint?
# Any result other than HTTP 200 is reported as a warning, never a failure.
# Arguments: $1 - gateway base URL (default http://localhost:8000)
check_gateway() {
  local gateway_url="${1:-http://localhost:8000}"

  local status
  # curl prints "000" itself when the connection fails; do not append a
  # second "000" (that produced "000000" and skipped the branch below).
  status=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 \
    "$gateway_url/health" 2> /dev/null) || true
  status="${status:-000}"

  if [ "$status" = "200" ]; then
    ok "Gateway responds (HTTP $status)"
  elif [ "$status" = "000" ]; then
    warn "Gateway not reachable at $gateway_url" "Check gateway service: systemctl status gateway || docker ps"
  else
    warn "Gateway returned HTTP $status" "Check gateway logs"
  fi
}
|
||||||
|
|
||||||
|
# ── Check 4: Disk space ────────────────────────────
|
||||||
|
# Check 4: root filesystem usage.
#   < 80%  -> ok
#   80-89% -> warn
#   >= 90% -> fail (recovery: cleanup_disk)
# If the usage figure cannot be read, a warning is raised instead of
# falling through to the critical branch.
check_disk() {
  local usage
  # -P forces the one-line-per-filesystem POSIX layout, so a long device
  # name cannot wrap the data onto a second line. Field 5 is "Capacity".
  usage=$(df -P / 2> /dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || usage=""

  # Refuse to compare anything that is not a plain non-negative integer;
  # otherwise both -lt tests error out and the critical branch would run.
  case "$usage" in
    '' | *[!0-9]*)
      warn "Cannot determine disk usage for /" "Check manually: df -h /"
      return 0
      ;;
  esac

  if [ "$usage" -lt 80 ]; then
    ok "Disk usage: ${usage}%"
  elif [ "$usage" -lt 90 ]; then
    warn "Disk usage: ${usage}%" "Clean up logs and temp files: journalctl --vacuum-size=100M"
  else
    fail "Disk usage CRITICAL: ${usage}%" "Emergency cleanup needed" "cleanup_disk"
  fi
}
|
||||||
|
|
||||||
|
# ── Check 5: SSL cert ──────────────────────────────
|
||||||
|
# Check 5: certificate expiry for a Let's Encrypt certificate.
#   > 30 days left            -> ok
#   not yet expired, <= 30 d  -> warn
#   expired                   -> fail (no automatic recovery)
# A missing certificate directory is not an error (only mentioned with
# --verbose); an unreadable or unparsable expiry date is a warning.
# Arguments: $1 - domain (default localhost)
#            $2 - directory holding per-domain cert dirs
#                 (default /etc/letsencrypt/live)
# Globals:   VERBOSE (read)
check_ssl() {
  local domain="${1:-localhost}"
  local cert_root="${2:-/etc/letsencrypt/live}"
  local cert_dir="$cert_root/$domain"

  if [ ! -d "$cert_dir" ]; then
    if [ "$VERBOSE" = 1 ]; then
      warn "No Let's Encrypt cert at $cert_dir" "Assuming self-signed or no SSL"
    fi
    return 0
  fi

  # Directory exists but holds no chain file: nothing to inspect.
  if [ ! -f "$cert_dir/fullchain.pem" ]; then
    return 0
  fi

  local expiry
  # Output looks like "notAfter=<date>"; keep the part after "=".
  expiry=$(openssl x509 -enddate -noout -in "$cert_dir/fullchain.pem" 2> /dev/null | cut -d= -f2) || expiry=""
  if [ -z "$expiry" ]; then
    warn "Cannot read SSL cert expiry" "Check cert: openssl x509 -enddate -noout -in $cert_dir/fullchain.pem"
    return 0
  fi

  local expiry_epoch
  # Try GNU date first, then the BSD/macOS form.
  expiry_epoch=$(date -d "$expiry" +%s 2> /dev/null \
    || date -j -f "%b %d %T %Y %Z" "$expiry" +%s 2> /dev/null) || expiry_epoch=""
  # An unparsable date must not be treated as epoch 0, which would report
  # a perfectly valid certificate as expired.
  case "$expiry_epoch" in
    '' | *[!0-9]*)
      warn "Cannot parse SSL cert expiry date: $expiry" "Check cert: openssl x509 -enddate -noout -in $cert_dir/fullchain.pem"
      return 0
      ;;
  esac

  local now_epoch
  now_epoch=$(date +%s)
  local seconds_left=$((expiry_epoch - now_epoch))
  local days_left=$((seconds_left / 86400))

  if [ "$days_left" -gt 30 ]; then
    ok "SSL cert expires in ${days_left} days ($expiry)"
  elif [ "$seconds_left" -gt 0 ]; then
    # Compare seconds, not whole days: a cert with under 24h left is still valid.
    warn "SSL cert expires in ${days_left} days!" "Renew: certbot renew"
  else
    # ":" is a no-op recovery step, since renewal is left to the operator.
    fail "SSL cert has EXPIRED" "Renew immediately: certbot renew --force-renewal" ":"
  fi
}
|
||||||
|
|
||||||
|
# ── Recovery functions ──────────────────────────────
|
||||||
|
# Recovery: bring nginx back up.
# Prefers `systemctl restart nginx`; without systemctl it tries a reload of
# a running nginx and, failing that, starts the binary directly. Outcomes
# are reported through info()/warn(); the function itself does not abort.
restart_nginx() {
  info "Attempting to restart nginx..."

  # Explicit if/else instead of `a && b || c`, so the warning only fires
  # when the restart itself failed.
  if command -v systemctl > /dev/null 2>&1; then
    if systemctl restart nginx; then
      info "nginx restarted successfully"
    else
      warn "nginx restart failed" "Manual intervention needed"
    fi
  elif command -v nginx > /dev/null 2>&1; then
    if nginx -s reload 2> /dev/null; then
      info "nginx reloaded"
    elif nginx; then
      info "nginx started"
    else
      warn "nginx start failed" "Manual intervention needed"
    fi
  else
    # Previously a silent no-op: make the missing tooling visible.
    warn "Cannot restart nginx: neither systemctl nor nginx found in PATH" "Manual intervention needed"
  fi
}
|
||||||
|
|
||||||
|
# Recovery: free disk space on a best-effort basis.
# Every step tolerates failure so the health check can continue.
cleanup_disk() {
  info "Running disk cleanup..."

  # Trim the systemd journal; ignored where journalctl is unavailable.
  journalctl --vacuum-size=100M 2> /dev/null || true

  # Remove only temp files untouched for more than a day. Wiping /tmp
  # wholesale on a live host can delete sockets, lock files and uploads
  # that running services still depend on.
  find /tmp -mindepth 1 -xdev -type f -mtime +1 -exec rm -f -- {} + 2> /dev/null || true

  # Compressed (already rotated) logs directly under /var/log.
  find /var/log -maxdepth 1 -xdev -type f -name '*.gz' -exec rm -f -- {} + 2> /dev/null || true

  info "Cleanup complete"
}
|
||||||
|
|
||||||
|
# ── Main ────────────────────────────────────────────
|
||||||
|
# Entry point: print a banner, run the five checks in order, summarise,
# and exit 1 when anything was flagged (0 otherwise).
# Environment: HEALTH_HOST (default localhost), HEALTH_PORT (default 80),
#              GATEWAY_URL (default http://localhost:8000)
info "=== The Door Health Check ==="
info "Host: ${HEALTH_HOST:-localhost}"
info "Time: $(date)"
echo ""

check_nginx "${HEALTH_HOST:-localhost}" "${HEALTH_PORT:-80}"
check_static "${HEALTH_HOST:-localhost}" "${HEALTH_PORT:-80}"
check_gateway "${GATEWAY_URL:-http://localhost:8000}"
check_disk
check_ssl "${HEALTH_HOST:-localhost}"

echo ""
if [ "$WARNINGS" -gt 0 ] || [ "$HEALTHY" -gt 0 ]; then
  info "Summary: $HEALTHY OK, $WARNINGS warnings/failures"
fi

if [ "$WARNINGS" -gt 0 ]; then
  if [ "$AUTO_RESTART" = 1 ]; then
    # warn() reads a second argument; calling it with only a message
    # aborts under `set -u`, so always supply the action hint.
    warn "Auto-restart mode is ON — recovery actions attempted" "Re-run the health check to confirm recovery"
  fi
  exit 1
fi

exit 0
|
||||||
91
resilience/service-restart.sh
Executable file
91
resilience/service-restart.sh
Executable file
@@ -0,0 +1,91 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# service-restart.sh — Graceful service restart for the-door
|
||||||
|
# Usage: bash service-restart.sh [--force]
|
||||||
|
#
|
||||||
|
# Performs ordered restart: stop -> brief pause -> start -> verify started
|
||||||
|
# with health check confirmation.
|
||||||
|
|
||||||
|
# Strict mode: abort on command errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# 1 when --force was given, 0 otherwise.
FORCE=0

# Walk the command-line arguments; anything other than --force prints usage and exits 1.
for arg; do
  if [ "$arg" = "--force" ]; then
    FORCE=1
  else
    echo "Usage: $0 [--force]"
    exit 1
  fi
done
|
||||||
|
|
||||||
|
# Print a message to stdout, prefixed with a [YYYY-MM-DD HH:MM:SS] timestamp.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
|
||||||
|
|
||||||
|
# ── Stop ────────────────────────────────────────────
|
||||||
|
# Stop nginx and whatever is listening on TCP port 8000 (the gateway).
# Every step is best-effort: a service that is already down is not an error.
stop_services() {
  log "Stopping services..."

  if command -v systemctl > /dev/null 2>&1; then
    if systemctl stop nginx 2> /dev/null; then
      log "nginx stopped"
    fi
  elif command -v nginx > /dev/null 2>&1; then
    if nginx -s stop 2> /dev/null; then
      log "nginx stopped"
    fi
  fi

  # Stop gateway if running.
  # Restrict the lookup to LISTENing TCP sockets: a bare `-i:8000` also
  # matches clients of the gateway and connections to a remote port 8000.
  local gw_pids gw_pid
  gw_pids=$(lsof -t -iTCP:8000 -sTCP:LISTEN 2> /dev/null || true)

  # lsof prints one PID per line; signal each separately, because a
  # multi-line string passed as a single argument makes kill reject it.
  while IFS= read -r gw_pid; do
    [ -n "$gw_pid" ] || continue
    if kill "$gw_pid" 2> /dev/null; then
      log "Gateway stopped (PID $gw_pid)"
    fi
  done <<< "$gw_pids"

  # Short grace period for the processes to exit.
  sleep 1
  log "All services stopped"
}
|
||||||
|
|
||||||
|
# ── Start ───────────────────────────────────────────
|
||||||
|
# Start nginx via systemctl when available, otherwise via the nginx binary.
# Returns: 0 when nginx was started, 1 when the start command failed or no
#          way of starting nginx exists on this host.
# NOTE(review): stop_services also stops the process listening on port 8000,
# but nothing here starts it again; confirm the gateway is supervised
# elsewhere.
start_services() {
  log "Starting services..."

  # Start nginx
  if command -v systemctl > /dev/null 2>&1; then
    if ! systemctl start nginx; then
      log "FAILED to start nginx"
      return 1
    fi
    log "nginx started"
  elif command -v nginx > /dev/null 2>&1; then
    if ! nginx 2> /dev/null; then
      log "FAILED to start nginx"
      return 1
    fi
    log "nginx started"
  else
    # Without either tool nothing was started; do not report success.
    log "FAILED to start nginx: neither systemctl nor nginx found in PATH"
    return 1
  fi

  log "All services started"
}
|
||||||
|
|
||||||
|
# ── Verify ──────────────────────────────────────────
|
||||||
|
# Confirm that nginx is running and that the site root answers over HTTP.
# Arguments: $1 - host to probe (default localhost)
# Returns:   1 when no nginx process is found; 0 otherwise (a non-200
#            response is only logged as a warning).
verify_services() {
  local host="${1:-localhost}"

  log "Verifying services..."

  # Check nginx
  if ! pgrep -x nginx > /dev/null 2>&1; then
    log "ERROR: nginx failed to start"
    return 1
  fi
  log "nginx is running"

  # Check static file
  local status
  # curl prints "000" itself when it cannot connect; appending another
  # "000" on failure would log "HTTP 000000". Fall back only when curl
  # produced no output at all.
  status=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 \
    "http://$host/" 2> /dev/null) || true
  status="${status:-000}"

  if [ "$status" = "200" ]; then
    log "Static content verified (HTTP $status)"
  else
    log "WARNING: Static content check returned HTTP $status"
  fi
}
|
||||||
|
|
||||||
|
# ── Main ────────────────────────────────────────────
|
||||||
|
# Entry point: optionally stop, then start and verify the services.
# Environment: HEALTH_HOST - host passed to verify_services (default localhost)
log "=== Service Restart ==="

# With --force the stop phase is bypassed and only start + verify run.
# NOTE(review): that means --force never stops a running nginx — confirm
# this is the intended meaning of "force".
case "$FORCE" in
  1) log "FORCE mode — skipping graceful stop" ;;
  *) stop_services ;;
esac

start_services
verify_services "${HEALTH_HOST:-localhost}"

log "=== Restart complete ==="
|
||||||
Reference in New Issue
Block a user