Adds operational resilience tooling:
- resilience/health-check.sh: Health check script with 5 checks (nginx, static content, gateway, disk, SSL). Supports --auto-restart and --verbose modes.
- resilience/service-restart.sh: Graceful, ordered service restart with a stop -> verify -> start -> verify cycle. Supports --force mode.
- Fallback logic for when the gateway is unreachable (graceful degradation to static pages).

All scripts are self-contained, have no external dependencies, and work on common Linux distros.
178 lines
5.9 KiB
Bash
Executable File
178 lines
5.9 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# health-check.sh — Health check and service monitoring for the-door
|
|
# Usage: bash health-check.sh [--auto-restart] [--verbose]
|
|
#
|
|
# Checks:
|
|
# 1. nginx process is running
|
|
# 2. Static files are accessible (index.html serves correctly)
|
|
# 3. Gateway endpoint responds (if configured)
|
|
# 4. Disk space is adequate (warns at 80% used, fails at 90% used)
|
|
# 5. SSL cert is valid and not expiring soon
|
|
|
|
set -euo pipefail
|
|
|
|
# Runtime flags (set from the command line) and result counters.
VERBOSE=0      # 1 when --verbose was given
AUTO_RESTART=0 # 1 when --auto-restart was given
HEALTHY=0      # incremented by ok()
WARNINGS=0     # incremented by warn()

# Print the one-line usage summary to stdout.
usage() {
  echo "Usage: $0 [--auto-restart] [--verbose]"
}

# Parse command-line flags into VERBOSE / AUTO_RESTART.
# Arguments: the script's own arguments ("$@")
# Exits:     0 after printing usage for -h/--help;
#            1 after printing usage to stderr for any unknown argument
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --verbose) VERBOSE=1 ;;
      --auto-restart) AUTO_RESTART=1 ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        # Usage errors are diagnostics, so they go to stderr
        usage >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
|
|
|
# Output helpers. Every message is written to stdout.

# log MESSAGE — print MESSAGE prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"; }

# info MESSAGE — informational line; leaves the counters untouched.
info() { log "INFO $1"; }

# warn MESSAGE [ACTION] — report a problem and count it in WARNINGS.
# ACTION is optional so a one-argument call cannot trip `set -u`.
warn() {
  log "WARN $1"
  if [ -n "${2:-}" ]; then echo " ACTION: $2"; fi
  WARNINGS=$((WARNINGS + 1))
}

# ok MESSAGE — report a passing check and count it in HEALTHY.
ok() {
  log "OK $1"
  HEALTHY=$((HEALTHY + 1))
}

# fail MESSAGE ACTION [RECOVERY] — report a failed check.
#   MESSAGE   what failed
#   ACTION    hint printed for the operator
#   RECOVERY  optional name of a command/function to run when AUTO_RESTART=1
# Failures are counted in WARNINGS (the summary line reports
# "warnings/failures" from that counter), so a failed check makes the
# script exit non-zero.
fail() {
  local recovery="${3:-}"

  log "FAIL $1"
  if [ -n "${2:-}" ]; then echo " ACTION: $2"; fi
  WARNINGS=$((WARNINGS + 1))

  if [ "$AUTO_RESTART" = 1 ] && [ -n "$recovery" ]; then
    # Only run something that resolves to a real command or function;
    # never execute free-form text passed by mistake.
    if command -v "$recovery" > /dev/null 2>&1; then
      "$recovery" || log "WARN recovery action failed: $recovery"
    else
      log "WARN recovery action not found: $recovery"
    fi
  fi
}
|
|
|
|
# ── Check 1: nginx ─────────────────────────────────
|
|
# Check 1: verify that a process named exactly "nginx" exists.
# Arguments: $1 host, $2 port — accepted so the call matches the other
#            checks; this check does not use them.
# Reports:   ok with the first matching PID, otherwise fail with
#            restart_nginx as the recovery action.
check_nginx() {
  local pids

  # Query once so the PID shown comes from the same snapshot as the verdict
  pids=$(pgrep -x nginx 2> /dev/null) || pids=""

  if [ -n "$pids" ]; then
    # pgrep prints one PID per line; keep only the first
    ok "nginx is running (PID: ${pids%%$'\n'*})"
  else
    fail "nginx is NOT running" "Start nginx: systemctl start nginx || nginx" "restart_nginx"
  fi
}
|
|
|
|
# ── Check 2: static files ──────────────────────────
|
|
# Check 2: fetch /index.html and classify the HTTP status.
# Arguments: $1 host (default localhost)
#            $2 port (default 80); 80 and 443 are left implicit in the URL,
#               any other value is appended as ":port"
# Reports:   ok on HTTP 200; fail (recovery: restart_nginx) when no HTTP
#            response was obtained; warn on any other status.
check_static() {
  local host="${1:-localhost}"
  local port="${2:-80}"
  local protocol="http"

  # Switch to HTTPS when a certificate directory is present.
  # NOTE(review): /etc/ssl exists on many systems even without an HTTPS
  # vhost, so this heuristic may pick https too eagerly — confirm intent.
  if [ -d "/etc/letsencrypt" ] || [ -d "/etc/ssl" ]; then
    protocol="https"
  fi

  local url="$protocol://$host"
  case "$port" in
    80 | 443) ;; # standard ports: let the scheme decide
    *) url="$url:$port" ;;
  esac

  # curl itself writes "000" for %{http_code} when the request fails, so do
  # not append a second fallback value; only fill in "000" if curl printed
  # nothing at all (e.g. curl is not installed).
  local status
  status=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 -k "$url/index.html" 2> /dev/null) || true
  status="${status:-000}"

  case "$status" in
    200)
      ok "index.html serves OK (HTTP $status)"
      ;;
    000)
      fail "Cannot reach $url" "Check nginx config: nginx -t" "restart_nginx"
      ;;
    *)
      warn "Unexpected status for index.html: HTTP $status" "Check nginx config and file permissions"
      ;;
  esac
}
|
|
|
|
# ── Check 3: Gateway ───────────────────────────────
|
|
# Check 3: probe the gateway's /health endpoint.
# Arguments: $1 gateway base URL (default http://localhost:8000);
#               one trailing "/" is tolerated
# Reports:   ok on HTTP 200; warn when no HTTP response was obtained or when
#            any other status is returned. Gateway problems are warnings only.
check_gateway() {
  local gateway_url="${1:-http://localhost:8000}"
  local status

  # curl itself writes "000" for %{http_code} when the request fails, so do
  # not append a second fallback value; only fill in "000" if curl printed
  # nothing at all (e.g. curl is not installed).
  status=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 "${gateway_url%/}/health" 2> /dev/null) || true
  status="${status:-000}"

  case "$status" in
    200)
      ok "Gateway responds (HTTP $status)"
      ;;
    000)
      warn "Gateway not reachable at $gateway_url" "Check gateway service: systemctl status gateway || docker ps"
      ;;
    *)
      warn "Gateway returned HTTP $status" "Check gateway logs"
      ;;
  esac
}
|
|
|
|
# ── Check 4: Disk space ────────────────────────────
|
|
# Check 4: report how full the root filesystem is.
# Reports: ok below 80% used; warn from 80% to 89%; fail (recovery:
#          cleanup_disk) at 90% or more; warn when the usage figure cannot
#          be read, so a parsing problem never triggers the cleanup.
check_disk() {
  local usage

  # -P forces the one-line-per-filesystem POSIX layout, so the percentage is
  # always field 5 even when the device name is long.
  usage=$(df -P / 2> /dev/null \
    | awk 'NR > 1 { pct = $5 } END { sub(/%/, "", pct); print pct }') || usage=""

  if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
    warn "Cannot determine disk usage" "Check manually: df -P /"
    return 0
  fi

  if [ "$usage" -lt 80 ]; then
    ok "Disk usage: ${usage}%"
  elif [ "$usage" -lt 90 ]; then
    warn "Disk usage: ${usage}%" "Clean up logs and temp files: journalctl --vacuum-size=100M"
  else
    fail "Disk usage CRITICAL: ${usage}%" "Emergency cleanup needed" "cleanup_disk"
  fi
}
|
|
|
|
# ── Check 5: SSL cert ──────────────────────────────
|
|
# Check 5: report how long the Let's Encrypt certificate remains valid.
# Arguments: $1 domain (default localhost)
#            $2 directory containing the per-domain cert directories
#               (default /etc/letsencrypt/live)
# Reports:   nothing when <root>/<domain> is absent (a warn in verbose mode)
#            or when it has no fullchain.pem; warn when the expiry cannot be
#            read or parsed; ok with more than 30 days left; warn while the
#            cert is still valid but has 30 days or fewer; fail once the
#            expiry time has passed.
check_ssl() {
  local domain="${1:-localhost}"
  local cert_root="${2:-/etc/letsencrypt/live}"
  local cert_dir="$cert_root/$domain"
  local cert_file="$cert_dir/fullchain.pem"

  if [ ! -d "$cert_dir" ]; then
    if [ "$VERBOSE" = 1 ]; then
      warn "No Let's Encrypt cert at $cert_dir" "Assuming self-signed or no SSL"
    fi
    return 0
  fi

  [ -f "$cert_file" ] || return 0

  # openssl prints "notAfter=<date>"; keep the part after "="
  local expiry
  expiry=$(openssl x509 -enddate -noout -in "$cert_file" 2> /dev/null | cut -d= -f2) || expiry=""
  if [ -z "$expiry" ]; then
    warn "Cannot read SSL cert expiry" "Check cert: openssl x509 -enddate -noout -in $cert_file"
    return 0
  fi

  # Try the GNU date syntax first, then the BSD one
  local expiry_epoch now_epoch
  expiry_epoch=$(date -d "$expiry" +%s 2> /dev/null) \
    || expiry_epoch=$(date -j -f "%b %d %T %Y %Z" "$expiry" +%s 2> /dev/null) \
    || expiry_epoch=""
  if ! [[ "$expiry_epoch" =~ ^[0-9]+$ ]]; then
    # An unparseable date must not be mistaken for an expired certificate
    warn "Cannot parse SSL cert expiry date: $expiry" "Check cert: openssl x509 -enddate -noout -in $cert_file"
    return 0
  fi
  now_epoch=$(date +%s)

  # Decide "expired" on seconds: whole days round down to 0 during the
  # certificate's final 24 hours, when it is still valid.
  local seconds_left=$((expiry_epoch - now_epoch))
  local days_left=$((seconds_left / 86400))

  if [ "$seconds_left" -le 0 ]; then
    # No automated recovery for an expired cert; "true" is a no-op
    # placeholder so fail always receives its third argument.
    fail "SSL cert has EXPIRED" "Renew immediately: certbot renew --force-renewal" "true"
  elif [ "$days_left" -gt 30 ]; then
    ok "SSL cert expires in ${days_left} days ($expiry)"
  else
    warn "SSL cert expires in ${days_left} days!" "Renew: certbot renew"
  fi
}
|
|
|
|
# ── Recovery functions ──────────────────────────────
|
|
# Recovery action: try to bring nginx back up.
# Uses `systemctl restart nginx` when systemctl is available; otherwise falls
# back to the nginx binary (reload first, plain start if the reload fails).
# Outcomes are reported through info/warn; a warn is also raised when neither
# tool is available, so the attempt never ends silently.
restart_nginx() {
  info "Attempting to restart nginx..."

  if command -v systemctl > /dev/null 2>&1; then
    if systemctl restart nginx; then
      info "nginx restarted successfully"
    else
      warn "nginx restart failed" "Manual intervention needed"
    fi
  elif command -v nginx > /dev/null 2>&1; then
    if nginx -s reload 2> /dev/null; then
      : # reload succeeded; nothing further to do
    elif nginx; then
      info "nginx started"
    else
      warn "nginx start failed" "Manual intervention needed"
    fi
  else
    warn "Cannot restart nginx: neither systemctl nor nginx found" "Manual intervention needed"
  fi
}
|
|
|
|
# Recovery action: free disk space on a best-effort basis.
# Arguments: $1 temp directory to prune (default /tmp)
#            $2 log directory whose *.gz files are removed (default /var/log)
# Steps:     shrink the systemd journal to 100M, delete stale temp files,
#            delete compressed logs. Each step is allowed to fail.
cleanup_disk() {
  local tmp_dir="${1:-/tmp}"
  local log_dir="${2:-/var/log}"

  info "Running disk cleanup..."

  # Best effort: journalctl may be missing or may need more privileges
  journalctl --vacuum-size=100M 2> /dev/null || true

  # Wiping the whole temp directory would also remove sockets and files that
  # running services still use. Delete only regular files that have not been
  # modified for more than a day, and skip hidden top-level entries.
  if [ -d "$tmp_dir" ]; then
    find "${tmp_dir:?}" -path "${tmp_dir%/}/.*" -prune \
      -o -type f -mtime +0 -exec rm -f -- {} + 2> /dev/null || true
  fi

  # ${log_dir:?} guards against the glob ever expanding from "/"
  rm -f -- "${log_dir:?}"/*.gz 2> /dev/null || true

  info "Cleanup complete"
}
|
|
|
|
# ── Main ────────────────────────────────────────────
|
|
# Entry point: print a banner, run the five checks in order, print a summary,
# and exit 1 if anything was counted in WARNINGS (0 otherwise).
# Environment: HEALTH_HOST (default localhost), HEALTH_PORT (default 80),
#              GATEWAY_URL (default http://localhost:8000)
info "=== The Door Health Check ==="
info "Host: ${HEALTH_HOST:-localhost}"
info "Time: $(date)"
echo ""

check_nginx "${HEALTH_HOST:-localhost}" "${HEALTH_PORT:-80}"
check_static "${HEALTH_HOST:-localhost}" "${HEALTH_PORT:-80}"
check_gateway "${GATEWAY_URL:-http://localhost:8000}"
check_disk
check_ssl "${HEALTH_HOST:-localhost}"

echo ""
if [ "$WARNINGS" -gt 0 ] || [ "$HEALTHY" -gt 0 ]; then
  info "Summary: $HEALTHY OK, $WARNINGS warnings/failures"
fi

if [ "$WARNINGS" -gt 0 ]; then
  if [ "$AUTO_RESTART" = 1 ]; then
    # Logged directly rather than via warn(): there is no ACTION to report,
    # and the counter must not change after the summary was printed.
    log "WARN Auto-restart mode is ON — recovery actions attempted"
  fi
  exit 1
fi

exit 0
|