#!/usr/bin/env bash
# runner-health-probe.sh — Gitea Runner Health Probe (poka-yoke detection layer)
# Refs: #1097 (POKA-YOKE: Make unregistered runners impossible to miss)
#
# Called every 5 minutes by runner-health-probe.timer (systemd).
# Can also be run manually for immediate status.
#
# POKA-YOKE detection + correction:
#   1. Queries Gitea API for active runner count
#   2. Reports count to Timmy Time via journal/log every run
#   3. On ZERO active runners:
#      a. Logs P1 alert to journal
#      b. Creates alert marker file for external watchers
#      c. Attempts to restart act_runner service (auto-correction)
#      d. Re-queries after restart to verify recovery
#
# Exit codes:
#   0 — runners healthy (≥1 online runner)
#   1 — zero runners detected (P1 alert fired)
#   2 — Gitea API unreachable (network/config error), or its response
#       could not be interpreted
#
# Environment overrides:
#   GITEA_URL, GITEA_TOKEN, GITEA_TOKEN_FILE, ALERT_DIR, RUNNER_SERVICE

# No `-e` on purpose: the probe inspects exit statuses itself and must keep
# going far enough to raise an alert even when individual commands fail.
set -uo pipefail

# ── Configuration ─────────────────────────────────────────────────────────────
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
GITEA_TOKEN="${GITEA_TOKEN:-}"
GITEA_TOKEN_FILE="${GITEA_TOKEN_FILE:-/etc/act_runner/gitea-probe-token}"
ALERT_DIR="${ALERT_DIR:-/var/lib/act_runner/alerts}"
RUNNER_SERVICE="${RUNNER_SERVICE:-act_runner}"

# Restart cooldown: don't restart more than once per 10 minutes
COOLDOWN_FILE="${ALERT_DIR}/.last_restart"
COOLDOWN_SECS=600

# ── Helpers ───────────────────────────────────────────────────────────────────
# log goes to stdout; warn and alert go to stderr so they never pollute the
# stdout of functions whose output is captured with $(...).
log()   { printf '[%s] RUNNER-PROBE: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"; }
warn()  { printf '[%s] RUNNER-PROBE WARNING: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2; }
alert() { printf '[%s] RUNNER-PROBE P1-ALERT: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2; }

# Load token from file if not set via env
if [[ -z "$GITEA_TOKEN" && -f "$GITEA_TOKEN_FILE" ]]; then
  GITEA_TOKEN=$(cat -- "$GITEA_TOKEN_FILE")
fi

if [[ -z "$GITEA_TOKEN" ]]; then
  warn "No Gitea API token configured. Set GITEA_TOKEN env var or write to ${GITEA_TOKEN_FILE}"
  warn "Cannot query runner health without API token. Exiting."
  exit 2
fi

# Keep probing even if the alert directory is unavailable: journal output is
# still useful, only the marker files will be missing.
mkdir -p -- "$ALERT_DIR" || warn "Could not create alert directory ${ALERT_DIR}"

# ── Query Gitea runner count ──────────────────────────────────────────────────
#######################################
# Ask the Gitea admin API for the runner list and count the usable runners.
# Globals:   GITEA_URL, GITEA_TOKEN (read)
# Arguments: none
# Outputs:   the runner count (a non-negative integer) on stdout;
#            diagnostics on stderr
# Returns:   0 when a count was printed,
#            2 when the request failed, the API answered with a non-200
#              status, or the response could not be interpreted
#######################################
query_active_runners() {
  local response http_code body runner_count
  local -r marker='__HTTP_STATUS__'

  # Fetch runners list — Gitea admin endpoint.
  # `-f` is deliberately not used: with it curl exits non-zero on HTTP >= 400
  # and the status-specific warning below could never be reached.
  response=$(curl -s \
    --max-time 15 \
    -H "Authorization: token ${GITEA_TOKEN}" \
    -H "Content-Type: application/json" \
    -w "\n${marker}%{http_code}" \
    "${GITEA_URL}/api/v1/admin/runners?limit=50" 2>/dev/null) || {
    warn "Gitea API request failed (curl error). URL: ${GITEA_URL}/api/v1/admin/runners"
    return 2
  }

  # The -w format appends "<newline>__HTTP_STATUS__<code>" after the body;
  # split the two apart with parameter expansion (no grep -P / sed needed).
  http_code=${response##*"${marker}"}
  body=${response%$'\n'"${marker}"*}

  if [[ "$http_code" != "200" ]]; then
    warn "Gitea API returned HTTP ${http_code}. Check token permissions (requires admin)."
    return 2
  fi

  # Count runners that are "online" or "active"
  # Gitea runner status field: "online", "offline", "idle", "active"
  # The payload may be a bare list or an object wrapping the list under
  # "runners" / "data"; a null list is treated as empty.
  runner_count=$(printf '%s' "$body" | python3 -c '
import json, sys
data = json.load(sys.stdin)
runners = data if isinstance(data, list) else (data.get("runners") or data.get("data") or [])
print(sum(1 for r in runners if r.get("status") in ("online", "idle", "active")))
' 2>/dev/null) || {
    # Fallback: count all runners if status parse fails
    runner_count=$(printf '%s' "$body" | python3 -c '
import json, sys
data = json.load(sys.stdin)
runners = data if isinstance(data, list) else (data.get("runners") or data.get("data") or [])
print(len(runners))
' 2>/dev/null) || {
      # Neither parse worked (python3 missing, invalid JSON, ...). Reporting 0
      # here would raise a false P1 alert and restart a healthy service, so
      # surface it as a probe error instead.
      warn "Could not parse Gitea runner list — runner count unknown."
      return 2
    }
    warn "Could not parse runner status — counting all runners: ${runner_count}"
  }

  # Callers use the value in arithmetic; refuse anything that is not a number.
  if [[ ! "$runner_count" =~ ^[0-9]+$ ]]; then
    warn "Unexpected runner count value: '${runner_count}'"
    return 2
  fi

  printf '%s\n' "$runner_count"
  return 0
}

# ── Cooldown check ────────────────────────────────────────────────────────────
#######################################
# Tell whether the last recorded restart is younger than COOLDOWN_SECS.
# Globals:   COOLDOWN_FILE, COOLDOWN_SECS (read)
# Outputs:   a log line on stdout when the cooldown is active
# Returns:   0 when still cooling down, 1 otherwise (including when the
#            stamp file is missing, unreadable or does not hold a number)
#######################################
in_cooldown() {
  [[ -f "$COOLDOWN_FILE" ]] || return 1

  local last_restart now age
  last_restart=$(cat -- "$COOLDOWN_FILE" 2>/dev/null) || last_restart=0
  # An empty or corrupt stamp must not abort the probe in $(( )) under set -u.
  [[ "$last_restart" =~ ^[0-9]+$ ]] || last_restart=0

  now=$(date +%s)
  age=$(( now - 10#$last_restart ))  # 10# keeps a leading zero from meaning octal
  if (( age < COOLDOWN_SECS )); then
    log "Restart cooldown active (${age}s < ${COOLDOWN_SECS}s). Skipping restart attempt."
    return 0
  fi
  return 1
}

# Stamp the current epoch time so in_cooldown can rate-limit restarts.
record_restart() {
  date +%s > "$COOLDOWN_FILE"
}

# ── Main probe logic ──────────────────────────────────────────────────────────
log "Querying Gitea runner health at ${GITEA_URL}..."

RUNNER_COUNT=$(query_active_runners)
QUERY_EXIT=$?

if [[ $QUERY_EXIT -ne 0 ]]; then
  warn "API unreachable — cannot assess runner health. Check network and token."
  # Write an "unknown" alert marker so monitoring can see the probe itself is broken
  printf '%s PROBE_ERROR: API unreachable\n' "$(date -Iseconds)" >> "${ALERT_DIR}/probe-errors.log"
  exit 2
fi

log "Active runner count: ${RUNNER_COUNT}"

# ── Healthy path ──────────────────────────────────────────────────────────────
if (( RUNNER_COUNT > 0 )); then
  log "Runners OK. ${RUNNER_COUNT} active runner(s) online."
  # Clear any stale P1 alert marker
  rm -f -- "${ALERT_DIR}/p1-zero-runners.alert"
  exit 0
fi

# ── Zero-runner P1 alert path ─────────────────────────────────────────────────
alert "ZERO active runners detected on ${GITEA_URL}!"
alert "All CI jobs will queue silently. Attempting auto-correction."

# Write P1 alert marker (watched by external monitoring, logs, etc.)
ALERT_FILE="${ALERT_DIR}/p1-zero-runners.alert"

# NOTE(review): the copy of this script under review was truncated between the
# here-document redirection and the `2>&1` of the systemctl call. The alert
# file body, the cooldown gate and the restart log line below are a
# reconstruction based on the header comments and the surrounding messages —
# confirm them against version control before merging.
cat > "$ALERT_FILE" <<ALERT_EOF
P1 ALERT: ZERO ACTIVE GITEA RUNNERS
Detected  : $(date -Iseconds)
Host      : $(hostname)
Gitea     : ${GITEA_URL}
Service   : ${RUNNER_SERVICE}
ALERT_EOF

# ── Auto-correction: restart the runner service ───────────────────────────────
if in_cooldown; then
  alert "Auto-restart skipped (cooldown active). Zero runners persist."
  exit 1
fi

log "Attempting to restart ${RUNNER_SERVICE} service..."
if systemctl restart "$RUNNER_SERVICE" 2>&1; then
  record_restart
  log "Service restart issued. Waiting 15s for runner to register..."
  sleep 15

  # Re-query to verify recovery; a failed query counts as "still zero".
  RUNNER_COUNT_AFTER=$(query_active_runners 2>/dev/null || echo "0")

  if (( RUNNER_COUNT_AFTER > 0 )); then
    log "Recovery SUCCESS: ${RUNNER_COUNT_AFTER} runner(s) online after restart."
    # Append recovery note to alert file (leave file as audit trail)
    printf 'Recovered : %s — %s runner(s) online after restart\n' \
      "$(date -Iseconds)" "$RUNNER_COUNT_AFTER" >> "$ALERT_FILE"
    exit 0
  else
    alert "Recovery FAILED: still zero runners after restart."
    alert "Manual intervention required."
    alert "Next steps:"
    alert " 1. ssh root@$(hostname) 'journalctl -u ${RUNNER_SERVICE} -n 100'"
    alert " 2. Verify registration token: ${GITEA_URL}/user/settings/applications"
    # NOTE(review): the argument placeholder after --token was missing in the
    # reviewed copy; "<token>" is a stand-in.
    alert " 3. Re-run: /root/wizards/the-nexus/scripts/provision-runner.sh --token <token>"
    printf 'AutoRestart: FAILED at %s\n' "$(date -Iseconds)" >> "$ALERT_FILE"
    exit 1
  fi
else
  alert "systemctl restart ${RUNNER_SERVICE} failed — service may not exist on this host."
  alert "Verify act_runner is installed via provision-runner.sh."
  printf 'AutoRestart: systemctl failed at %s\n' "$(date -Iseconds)" >> "$ALERT_FILE"
  exit 1
fi