#!/usr/bin/env bash
# health-check.sh — Health check and service monitoring for the-door
# Usage: bash health-check.sh [--auto-restart] [--verbose]
#
# Checks:
#   1. nginx process is running
#   2. Static files are accessible (index.html serves correctly)
#   3. Gateway endpoint responds (if configured)
#   4. Disk space is adequate (< 90% used)
#   5. SSL cert is valid and not expiring soon
#
# Environment:
#   HEALTH_HOST  host probed by the static-file and SSL checks (default: localhost)
#   HEALTH_PORT  forwarded to the nginx/static checks (default: 80; currently
#                not used when building the probe URL)
#   GATEWAY_URL  base URL of the gateway; "/health" is appended
#                (default: http://localhost:8000)
#
# Exit status: 0 when no warning or failure was recorded, 1 otherwise
# (also 1 on an unknown command-line option).

set -euo pipefail

VERBOSE=0       # 1 => print informational notes that are otherwise hidden
AUTO_RESTART=0  # 1 => fail() runs the recovery function attached to a failure
HEALTHY=0       # number of checks that reported OK
WARNINGS=0      # number of warnings AND failures (drives the exit status)

for arg in "$@"; do
  case "$arg" in
    --verbose) VERBOSE=1 ;;
    --auto-restart) AUTO_RESTART=1 ;;
    *)
      echo "Usage: $0 [--auto-restart] [--verbose]"
      exit 1
      ;;
  esac
done

# ── Logging helpers ─────────────────────────────────

# log MESSAGE — print MESSAGE prefixed with a local timestamp.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"; }

# info MESSAGE — informational line; does not affect counters.
info() { log "INFO $1"; }

# ok MESSAGE — record a passing check.
ok() {
  log "OK $1"
  HEALTHY=$((HEALTHY + 1))
}

# warn MESSAGE [ACTION] — record a warning; ACTION is an optional hint for
# the operator. ACTION is optional so that a one-argument call does not trip
# `set -u`.
warn() {
  log "WARN $1"
  if [[ -n "${2:-}" ]]; then
    echo " ACTION: $2"
  fi
  WARNINGS=$((WARNINGS + 1))
}

# fail MESSAGE [ACTION] [RECOVERY_FUNCTION]
# Record a failure. Failures are counted together with warnings so that the
# summary line ("warnings/failures") and the exit status reflect them.
# With --auto-restart, RECOVERY_FUNCTION is invoked, but only if it names a
# shell function defined in this script; arbitrary strings are never executed.
fail() {
  local recovery="${3:-}"

  log "FAIL $1"
  if [[ -n "${2:-}" ]]; then
    echo " ACTION: $2"
  fi
  WARNINGS=$((WARNINGS + 1))

  if [[ "$AUTO_RESTART" == 1 && -n "$recovery" ]]; then
    if declare -F "$recovery" > /dev/null; then
      # A failing recovery must not abort the remaining checks under `set -e`.
      "$recovery" || log "WARN Recovery action $recovery failed"
    else
      log "WARN Unknown recovery action: $recovery"
    fi
  fi
}

# _http_status CURL_ARGS... — print the HTTP status code of a request, or 000
# when no well-formed status was obtained (connection refused, timeout, curl
# missing, ...).
# curl's --write-out already prints 000 when the connection fails, so the
# fallback must replace the captured value rather than append to it.
_http_status() {
  local code
  code=$(curl -s -o /dev/null -w '%{http_code}' --connect-timeout 5 "$@" 2> /dev/null) || true
  if [[ "$code" =~ ^[0-9]{3}$ ]]; then
    printf '%s\n' "$code"
  else
    printf '000\n'
  fi
}

# ── Check 1: nginx ─────────────────────────────────
# check_nginx [HOST] [PORT]
# HOST and PORT are accepted for symmetry with the other checks but are not
# needed: the check only looks for a process named exactly "nginx".
check_nginx() {
  local pids
  if pids=$(pgrep -x nginx 2> /dev/null); then
    # pgrep prints one PID per line; report the first one.
    ok "nginx is running (PID: ${pids%%$'\n'*})"
  else
    fail "nginx is NOT running" "Start nginx: systemctl start nginx || nginx" "restart_nginx"
  fi
}

# ── Check 2: static files ──────────────────────────
# check_static [HOST] [PORT]
# Requests /index.html from HOST and expects HTTP 200.
# NOTE(review): PORT is accepted but not used in the URL; the scheme's
# default port is probed. Confirm whether HEALTH_PORT should be honoured.
check_static() {
  local host="${1:-localhost}"
  local protocol="http"

  # Check for HTTPS
  # NOTE(review): /etc/ssl exists on most Linux systems, so this heuristic
  # selects https almost everywhere — confirm that is intended.
  if [[ -d "/etc/letsencrypt" || -d "/etc/ssl" ]]; then
    protocol="https"
  fi

  local status
  # -k: accept self-signed certificates.
  status=$(_http_status -k "$protocol://$host/index.html")

  case "$status" in
    200)
      ok "index.html serves OK (HTTP $status)"
      ;;
    000)
      # No recovery function: check_nginx already restarts a dead nginx.
      fail "Cannot reach $protocol://$host" "Check nginx config: nginx -t"
      ;;
    *)
      warn "Unexpected status for index.html: HTTP $status" "Check nginx config and file permissions"
      ;;
  esac
}

# ── Check 3: Gateway ───────────────────────────────
# check_gateway [GATEWAY_URL]
# Requests GATEWAY_URL/health and expects HTTP 200. Gateway problems are
# reported as warnings only.
check_gateway() {
  local gateway_url="${1:-http://localhost:8000}"
  local status
  status=$(_http_status "$gateway_url/health")

  case "$status" in
    200)
      ok "Gateway responds (HTTP $status)"
      ;;
    000)
      warn "Gateway not reachable at $gateway_url" "Check gateway service: systemctl status gateway || docker ps"
      ;;
    *)
      warn "Gateway returned HTTP $status" "Check gateway logs"
      ;;
  esac
}

# ── Check 4: Disk space ────────────────────────────
# check_disk
# Reports usage of the root filesystem: OK below 80%, warning below 90%,
# failure (with optional cleanup) at 90% and above.
check_disk() {
  local usage
  # -P forces the POSIX one-line-per-filesystem layout, so a long device name
  # cannot wrap the record and shift the columns.
  usage=$(df -P / 2> /dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || usage=""

  if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
    warn "Cannot determine disk usage for /" "Check manually: df -h /"
    return 0
  fi

  if [ "$usage" -lt 80 ]; then
    ok "Disk usage: ${usage}%"
  elif [ "$usage" -lt 90 ]; then
    warn "Disk usage: ${usage}%" "Clean up logs and temp files: journalctl --vacuum-size=100M"
  else
    fail "Disk usage CRITICAL: ${usage}%" "Emergency cleanup needed" "cleanup_disk"
  fi
}

# ── Check 5: SSL cert ──────────────────────────────
# check_ssl [DOMAIN]
# Inspects /etc/letsencrypt/live/DOMAIN/fullchain.pem: OK with more than 30
# days left, warning with 30 days or fewer, failure once the expiry time has
# passed.
check_ssl() {
  local domain="${1:-localhost}"
  local cert_dir="/etc/letsencrypt/live/$domain"
  local cert_file="$cert_dir/fullchain.pem"

  if [[ ! -d "$cert_dir" ]]; then
    # Informational only: --verbose must not change the exit status.
    if [[ "$VERBOSE" == 1 ]]; then
      info "No Let's Encrypt cert at $cert_dir (assuming self-signed or no SSL)"
    fi
    return 0
  fi

  if [[ ! -f "$cert_file" ]]; then
    warn "Cert directory exists but $cert_file is missing" "Check certbot: certbot certificates"
    return 0
  fi

  local expiry
  # Output looks like "notAfter=<date>"; keep the part after "=".
  expiry=$(openssl x509 -enddate -noout -in "$cert_file" 2> /dev/null | cut -d= -f2) || expiry=""
  if [[ -z "$expiry" ]]; then
    warn "Cannot read SSL cert expiry" "Check cert: openssl x509 -enddate -noout -in $cert_dir/fullchain.pem"
    return 0
  fi

  local expiry_epoch
  # GNU date first, then the BSD/macOS form.
  expiry_epoch=$(date -d "$expiry" +%s 2> /dev/null) \
    || expiry_epoch=$(date -j -f "%b %d %T %Y %Z" "$expiry" +%s 2> /dev/null) \
    || expiry_epoch=""
  if [[ ! "$expiry_epoch" =~ ^-?[0-9]+$ ]]; then
    # An unparseable date must not be mistaken for "expired in 1970".
    warn "Cannot parse SSL cert expiry date: $expiry" "Check cert: openssl x509 -enddate -noout -in $cert_dir/fullchain.pem"
    return 0
  fi

  local now_epoch
  now_epoch=$(date +%s)
  local days_left=$(((expiry_epoch - now_epoch) / 86400))

  # Decide "expired" on seconds: integer division reports 0 days for a cert
  # that is still valid for a few more hours.
  if [ "$expiry_epoch" -le "$now_epoch" ]; then
    fail "SSL cert has EXPIRED" "Renew immediately: certbot renew --force-renewal"
  elif [ "$days_left" -gt 30 ]; then
    ok "SSL cert expires in ${days_left} days ($expiry)"
  else
    warn "SSL cert expires in ${days_left} days!" "Renew: certbot renew"
  fi
}

# ── Recovery functions ──────────────────────────────

# restart_nginx — restart nginx through systemctl when available, otherwise
# reload (or start) it through the nginx binary. Problems are reported via
# warn(); the function itself does not abort the script.
restart_nginx() {
  info "Attempting to restart nginx..."
  if command -v systemctl > /dev/null 2>&1; then
    if systemctl restart nginx; then
      info "nginx restarted successfully"
    else
      warn "nginx restart failed" "Manual intervention needed"
    fi
  elif command -v nginx > /dev/null 2>&1; then
    if nginx -s reload 2> /dev/null; then
      info "nginx reloaded"
    elif nginx; then
      info "nginx started"
    else
      warn "nginx start failed" "Manual intervention needed"
    fi
  else
    warn "Cannot restart nginx: neither systemctl nor nginx found in PATH" "Manual intervention needed"
  fi
}

# cleanup_disk — best-effort emergency cleanup; every step may fail silently.
# NOTE(review): `rm -rf /tmp/*` also removes sockets and temp files that
# running services may still be using — confirm this is acceptable on the
# target hosts.
cleanup_disk() {
  info "Running disk cleanup..."
  journalctl --vacuum-size=100M 2> /dev/null || true
  rm -rf /tmp/* 2> /dev/null || true
  rm -rf /var/log/*.gz 2> /dev/null || true
  info "Cleanup complete"
}

# ── Main ────────────────────────────────────────────

# main — run all checks, print the summary and exit with the overall status.
main() {
  local host="${HEALTH_HOST:-localhost}"
  local port="${HEALTH_PORT:-80}"

  info "=== The Door Health Check ==="
  info "Host: $host"
  info "Time: $(date)"
  echo ""

  check_nginx "$host" "$port"
  check_static "$host" "$port"
  check_gateway "${GATEWAY_URL:-http://localhost:8000}"
  check_disk
  check_ssl "$host"

  echo ""
  if [ "$WARNINGS" -gt 0 ] || [ "$HEALTHY" -gt 0 ]; then
    info "Summary: $HEALTHY OK, $WARNINGS warnings/failures"
  fi

  if [ "$WARNINGS" -gt 0 ]; then
    if [ "$AUTO_RESTART" = 1 ]; then
      # Logged directly so this notice is not itself counted as a warning.
      log "WARN Auto-restart mode is ON — recovery actions attempted"
    fi
    exit 1
  fi
  exit 0
}

main