diff --git a/resilience/health-check.sh b/resilience/health-check.sh
new file mode 100755
index 0000000..4859666
--- /dev/null
+++ b/resilience/health-check.sh
@@ -0,0 +1,323 @@
+#!/usr/bin/env bash
+# health-check.sh — Health check and service monitoring for the-door
+# Usage: bash health-check.sh [--auto-restart] [--verbose]
+#
+# Checks:
+# 1. nginx process is running
+# 2. Static files are accessible (index.html serves correctly)
+# 3. Gateway endpoint responds (if configured)
+# 4. Disk space is adequate (< 90% used)
+# 5. SSL cert is valid and not expiring soon
+#
+# Environment:
+#   HEALTH_HOST  host to probe, also the cert domain (default: localhost)
+#   HEALTH_PORT  port for the static-file probe (default: 80)
+#   GATEWAY_URL  gateway base URL (default: http://localhost:8000)
+#
+# Exit status: 0 when no check warned or failed, 1 otherwise (also on a
+# usage error).
+
+set -euo pipefail
+
+VERBOSE=0
+AUTO_RESTART=0
+HEALTHY=0
+# Counts warnings AND failures; any non-zero value makes the script exit 1.
+WARNINGS=0
+
+for arg in "$@"; do
+  case "$arg" in
+    --verbose) VERBOSE=1 ;;
+    --auto-restart) AUTO_RESTART=1 ;;
+    *)
+      echo "Usage: $0 [--auto-restart] [--verbose]"
+      exit 1
+      ;;
+  esac
+done
+
+# ── Logging helpers ─────────────────────────────────
+# log MESSAGE — print MESSAGE prefixed with a timestamp.
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"; }
+
+# info MESSAGE — informational line; never affects the exit status.
+info() { log "INFO $1"; }
+
+# ok MESSAGE — record a passed check.
+ok() {
+  log "OK $1"
+  HEALTHY=$((HEALTHY + 1))
+}
+
+# warn MESSAGE [ACTION] — record a warning; ACTION is the suggested remedy.
+# ACTION is optional so that a one-argument call cannot trip `set -u`.
+warn() {
+  log "WARN $1"
+  if [ -n "${2:-}" ]; then
+    echo " ACTION: $2"
+  fi
+  WARNINGS=$((WARNINGS + 1))
+}
+
+# fail MESSAGE [ACTION] [RECOVERY_FN] — record a failed check.
+# With --auto-restart, RECOVERY_FN (name of a function in this script) is
+# run. Failures are added to WARNINGS so that they reach the exit status.
+fail() {
+  local recovery="${3:-}"
+
+  log "FAIL $1"
+  if [ -n "${2:-}" ]; then
+    echo " ACTION: $2"
+  fi
+  WARNINGS=$((WARNINGS + 1))
+
+  if [ "$AUTO_RESTART" = 1 ] && [ -n "$recovery" ]; then
+    # A failing recovery must not abort the remaining checks under set -e.
+    "$recovery" || log "WARN Recovery action '$recovery' failed"
+  fi
+}
+
+# http_status URL [CURL_OPTION...] — print the HTTP status code for URL.
+# Prints exactly "000" when no response was received. curl already writes
+# "000" through -w when the connection fails, so the fallback is assigned
+# instead of echoed after it (which would produce "000000").
+http_status() {
+  local url="$1"
+  shift
+  local code
+
+  code=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 \
+    "$@" "$url" 2> /dev/null) || code="000"
+  echo "${code:-000}"
+}
+
+# ── Check 1: nginx ─────────────────────────────────
+# check_nginx [HOST] [PORT] — pass when a local nginx process exists.
+# HOST and PORT are accepted for a uniform call signature but are unused:
+# only the local process table is inspected.
+check_nginx() {
+  local pid
+
+  # pgrep exits non-zero when nothing matches; keep that away from set -e.
+  pid=$(pgrep -x nginx 2> /dev/null || true)
+  pid="${pid%%$'\n'*}" # first PID only
+
+  if [ -n "$pid" ]; then
+    ok "nginx is running (PID: $pid)"
+  else
+    fail "nginx is NOT running" \
+      "Start nginx: systemctl start nginx || nginx" "restart_nginx"
+  fi
+}
+
+# ── Check 2: static files ──────────────────────────
+# check_static [HOST] [PORT] — fetch /index.html and expect HTTP 200.
+# HTTPS is tried first when a certificate directory exists (or PORT is
+# 443), then plain HTTP: the presence of /etc/ssl alone does not prove
+# that anything is listening for TLS.
+check_static() {
+  local host="${1:-localhost}"
+  local port="${2:-80}"
+  local authority="$host"
+  local protocol
+  local status="000"
+  local -a protocols=("http")
+
+  # 80 (the default) and 443 are implied by the scheme; only a custom port
+  # is written into the URL.
+  if [ "$port" != "80" ] && [ "$port" != "443" ]; then
+    authority="$host:$port"
+  fi
+
+  # Check for HTTPS
+  if [ "$port" = "443" ] || [ -d "/etc/letsencrypt" ] || [ -d "/etc/ssl" ]; then
+    protocols=("https" "http")
+  fi
+
+  for protocol in "${protocols[@]}"; do
+    # -k: accept self-signed certificates.
+    status=$(http_status "$protocol://$authority/index.html" -k)
+    if [ "$status" != "000" ]; then
+      break
+    fi
+  done
+
+  if [ "$status" = "200" ]; then
+    ok "index.html serves OK (HTTP $status)"
+  elif [ "$status" = "000" ]; then
+    fail "Cannot reach $authority (tried: ${protocols[*]})" \
+      "Check nginx config: nginx -t"
+  else
+    warn "Unexpected status for index.html: HTTP $status" \
+      "Check nginx config and file permissions"
+  fi
+}
+
+# ── Check 3: Gateway ───────────────────────────────
+# check_gateway [URL] — expect HTTP 200 from URL/health.
+# Any other outcome is a warning; no recovery is attempted.
+check_gateway() {
+  local gateway_url="${1:-http://localhost:8000}"
+  local status
+
+  status=$(http_status "$gateway_url/health")
+
+  if [ "$status" = "200" ]; then
+    ok "Gateway responds (HTTP $status)"
+  elif [ "$status" = "000" ]; then
+    warn "Gateway not reachable at $gateway_url" \
+      "Check gateway service: systemctl status gateway || docker ps"
+  else
+    warn "Gateway returned HTTP $status" "Check gateway logs"
+  fi
+}
+
+# ── Check 4: Disk space ────────────────────────────
+# check_disk — classify usage of /: below 80% ok, 80-89% warning, 90% and
+# above failure (which runs cleanup_disk with --auto-restart).
+check_disk() {
+  local usage
+
+  # -P keeps each filesystem on one line even when the device name is long.
+  usage=$(df -P / 2> /dev/null |
+    awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || usage=""
+
+  case "$usage" in
+    '' | *[!0-9]*)
+      warn "Cannot determine disk usage for /" "Check manually: df -h /"
+      return 0
+      ;;
+  esac
+
+  if [ "$usage" -lt 80 ]; then
+    ok "Disk usage: ${usage}%"
+  elif [ "$usage" -lt 90 ]; then
+    warn "Disk usage: ${usage}%" \
+      "Clean up logs and temp files: journalctl --vacuum-size=100M"
+  else
+    fail "Disk usage CRITICAL: ${usage}%" "Emergency cleanup needed" \
+      "cleanup_disk"
+  fi
+}
+
+# ── Check 5: SSL cert ──────────────────────────────
+# check_ssl [DOMAIN] — check expiry of the Let's Encrypt cert for DOMAIN.
+# More than 30 days left is ok, less is a warning, expired is a failure.
+# A host without /etc/letsencrypt/live/DOMAIN is skipped.
+check_ssl() {
+  local domain="${1:-localhost}"
+  local cert_dir="/etc/letsencrypt/live/$domain"
+  local cert="$cert_dir/fullchain.pem"
+  local expiry expiry_epoch now_epoch days_left
+
+  if [ ! -d "$cert_dir" ]; then
+    if [ "$VERBOSE" = 1 ]; then
+      # info, not warn: --verbose must not change the exit status.
+      info "No Let's Encrypt cert at $cert_dir (assuming self-signed or no SSL)"
+    fi
+    return 0
+  fi
+
+  # Nothing to inspect when the directory holds no certificate.
+  if [ ! -f "$cert" ]; then
+    return 0
+  fi
+
+  expiry=$(openssl x509 -enddate -noout -in "$cert" 2> /dev/null |
+    cut -d= -f2) || expiry=""
+  if [ -z "$expiry" ]; then
+    warn "Cannot read SSL cert expiry" \
+      "Check cert: openssl x509 -enddate -noout -in $cert"
+    return 0
+  fi
+
+  # GNU date first, then the BSD/macOS form; 0 marks "could not parse".
+  expiry_epoch=$(date -d "$expiry" +%s 2> /dev/null ||
+    date -j -f "%b %d %T %Y %Z" "$expiry" +%s 2> /dev/null ||
+    echo 0)
+  if [ "$expiry_epoch" -le 0 ]; then
+    # An unparsable date must not be reported as an expired certificate.
+    warn "Cannot parse SSL cert expiry date: $expiry" \
+      "Check cert: openssl x509 -enddate -noout -in $cert"
+    return 0
+  fi
+
+  now_epoch=$(date +%s)
+  days_left=$(((expiry_epoch - now_epoch) / 86400))
+
+  if [ "$days_left" -gt 30 ]; then
+    ok "SSL cert expires in ${days_left} days ($expiry)"
+  elif [ "$expiry_epoch" -gt "$now_epoch" ]; then
+    # Compared by epoch so a cert in its final day is not called expired.
+    warn "SSL cert expires in ${days_left} days!" "Renew: certbot renew"
+  else
+    fail "SSL cert has EXPIRED" \
+      "Renew immediately: certbot renew --force-renewal"
+  fi
+}
+
+# ── Recovery functions ──────────────────────────────
+# restart_nginx — restart nginx through systemctl when available, otherwise
+# reload (or start) it with the nginx binary.
+restart_nginx() {
+  info "Attempting to restart nginx..."
+
+  if command -v systemctl > /dev/null 2>&1; then
+    if systemctl restart nginx; then
+      info "nginx restarted successfully"
+    else
+      warn "nginx restart failed" "Manual intervention needed"
+    fi
+  elif command -v nginx > /dev/null 2>&1; then
+    # Fall back to a plain start when the reload signal cannot be delivered.
+    if nginx -s reload 2> /dev/null; then
+      info "nginx reloaded"
+    elif nginx; then
+      info "nginx started"
+    else
+      warn "nginx start failed" "Manual intervention needed"
+    fi
+  else
+    warn "Neither systemctl nor nginx found" "Manual intervention needed"
+  fi
+}
+
+# cleanup_disk — best-effort emergency cleanup; each step is allowed to
+# fail (missing tool, insufficient permissions).
+cleanup_disk() {
+  info "Running disk cleanup..."
+  journalctl --vacuum-size=100M 2> /dev/null || true
+  # Only regular files untouched for a day or more: wiping /tmp/* would
+  # also delete sockets and scratch files of running processes.
+  find /tmp -xdev -type f -mtime +0 -delete 2> /dev/null || true
+  rm -f -- /var/log/*.gz 2> /dev/null || true
+  info "Cleanup complete"
+}
+
+# ── Main ────────────────────────────────────────────
+info "=== The Door Health Check ==="
+info "Host: ${HEALTH_HOST:-localhost}"
+info "Time: $(date)"
+echo ""
+
+check_nginx "${HEALTH_HOST:-localhost}" "${HEALTH_PORT:-80}"
+check_static "${HEALTH_HOST:-localhost}" "${HEALTH_PORT:-80}"
+check_gateway "${GATEWAY_URL:-http://localhost:8000}"
+check_disk
+check_ssl "${HEALTH_HOST:-localhost}"
+
+echo ""
+if [ "$WARNINGS" -gt 0 ] || [ "$HEALTHY" -gt 0 ]; then
+  info "Summary: $HEALTHY OK, $WARNINGS warnings/failures"
+fi
+
+if [ "$WARNINGS" -gt 0 ]; then
+  if [ "$AUTO_RESTART" = 1 ]; then
+    # Logged directly: this notice is not a check result and must not bump
+    # the counter after the summary has been printed.
+    log "WARN Auto-restart mode is ON — recovery actions attempted"
+  fi
+  exit 1
+fi
+
+exit 0
diff --git a/resilience/service-restart.sh b/resilience/service-restart.sh
new file mode 100755
index 0000000..1b3e98e
--- /dev/null
+++ b/resilience/service-restart.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+# service-restart.sh — Graceful service restart for the-door
+# Usage: bash service-restart.sh [--force]
+#
+# Performs ordered restart: stop -> verify stopped -> start -> verify started
+# with health check confirmation. --force skips the stop phase.
+#
+# Environment:
+#   HEALTH_HOST  host for the post-start HTTP check (default: localhost)
+
+set -euo pipefail
+
+FORCE=0
+for arg in "$@"; do
+  case "$arg" in
+    --force) FORCE=1 ;;
+    *)
+      echo "Usage: $0 [--force]"
+      exit 1
+      ;;
+  esac
+done
+
+# log MESSAGE — print MESSAGE prefixed with a timestamp.
+log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"; }
+
+# ── Stop ────────────────────────────────────────────
+# stop_services — stop nginx and whatever listens on TCP port 8000.
+# Best effort: a service that is already down is not an error.
+stop_services() {
+  local pid
+  local -a gw_pids=()
+
+  log "Stopping services..."
+
+  if command -v systemctl > /dev/null 2>&1; then
+    if systemctl stop nginx 2> /dev/null; then
+      log "nginx stopped"
+    fi
+  elif command -v nginx > /dev/null 2>&1; then
+    if nginx -s stop 2> /dev/null; then
+      log "nginx stopped"
+    fi
+  fi
+
+  # Stop gateway if running. -sTCP:LISTEN limits the match to listeners; a
+  # bare -i:8000 also selects clients connected to any port 8000. lsof may
+  # print several PIDs, one per line, so they are collected into an array.
+  while IFS= read -r pid; do
+    if [ -n "$pid" ]; then
+      gw_pids+=("$pid")
+    fi
+  done < <(lsof -t -iTCP:8000 -sTCP:LISTEN 2> /dev/null || true)
+
+  if [ "${#gw_pids[@]}" -gt 0 ]; then
+    if kill "${gw_pids[@]}" 2> /dev/null; then
+      log "Gateway stopped (PID ${gw_pids[*]})"
+    fi
+  fi
+
+  sleep 1
+
+  # Verify stopped: report honestly when nginx survived the stop request.
+  if pgrep -x nginx > /dev/null 2>&1; then
+    log "WARNING: nginx is still running after stop"
+  else
+    log "All services stopped"
+  fi
+}
+
+# ── Start ───────────────────────────────────────────
+# start_services — start nginx; returns 1 when it cannot be started.
+# NOTE(review): the gateway killed by stop_services is not started here and
+# no start command for it is visible; confirm that something supervises it.
+start_services() {
+  log "Starting services..."
+
+  # Start nginx
+  if command -v systemctl > /dev/null 2>&1; then
+    if ! systemctl start nginx; then
+      log "FAILED to start nginx"
+      return 1
+    fi
+  elif command -v nginx > /dev/null 2>&1; then
+    if ! nginx 2> /dev/null; then
+      log "FAILED to start nginx"
+      return 1
+    fi
+  else
+    log "FAILED to start nginx: neither systemctl nor nginx found"
+    return 1
+  fi
+
+  log "nginx started"
+  log "All services started"
+}
+
+# ── Verify ──────────────────────────────────────────
+# verify_services [HOST] — confirm nginx is running (returns 1 if not) and
+# probe http://HOST/; a non-200 answer is logged as a warning only.
+verify_services() {
+  local host="${1:-localhost}"
+  local status
+
+  log "Verifying services..."
+
+  # Check nginx
+  if pgrep -x nginx > /dev/null 2>&1; then
+    log "nginx is running"
+  else
+    log "ERROR: nginx failed to start"
+    return 1
+  fi
+
+  # Check static file. curl itself prints "000" through -w on a connection
+  # error, so the fallback is assigned rather than echoed after it.
+  status=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 \
+    "http://$host/" 2> /dev/null) || status="000"
+  if [ "$status" = "200" ]; then
+    log "Static content verified (HTTP $status)"
+  else
+    log "WARNING: Static content check returned HTTP $status"
+  fi
+}
+
+# ── Main ────────────────────────────────────────────
+log "=== Service Restart ==="
+
+if [ "$FORCE" = 1 ]; then
+  log "FORCE mode — skipping graceful stop"
+else
+  stop_services
+fi
+
+start_services
+verify_services "${HEALTH_HOST:-localhost}"
+
+log "=== Restart complete ==="