Files
the-nexus/scripts/runner-health-probe.sh
2026-04-07 14:33:35 +00:00

191 lines
8.0 KiB
Bash

#!/usr/bin/env bash
# runner-health-probe.sh — Gitea Runner Health Probe (poka-yoke detection layer)
# Refs: #1097 (POKA-YOKE: Make unregistered runners impossible to miss)
#
# Called every 5 minutes by runner-health-probe.timer (systemd).
# Can also be run manually for immediate status.
#
# POKA-YOKE detection + correction:
# 1. Queries Gitea API for active runner count
# 2. Reports count to Timmy Time via journal/log every run
# 3. On ZERO active runners:
# a. Logs P1 alert to journal
# b. Creates alert marker file for external watchers
# c. Attempts to restart act_runner service (auto-correction)
# d. Re-queries after restart to verify recovery
#
# Exit codes:
# 0 — runners healthy (≥1 online runner)
# 1 — zero runners detected (P1 alert fired)
# 2 — Gitea API unreachable (network/config error)
# Shell options: unset variables are errors and a pipeline fails if any stage
# fails. errexit (-e) is not enabled; the script inspects exit statuses itself.
set -u
set -o pipefail
# ── Configuration ─────────────────────────────────────────────────────────────
# Each setting below keeps a non-empty value from the environment and
# otherwise falls back to the default shown.
: "${GITEA_URL:=https://forge.alexanderwhitestone.com}"
: "${GITEA_TOKEN:=}"
: "${GITEA_TOKEN_FILE:=/etc/act_runner/gitea-probe-token}"
: "${ALERT_DIR:=/var/lib/act_runner/alerts}"
: "${RUNNER_SERVICE:=act_runner}"
# Minimum spacing between automatic restarts: 600 seconds (10 minutes).
# The timestamp of the last restart is kept inside the alert directory.
COOLDOWN_SECS=600
COOLDOWN_FILE="${ALERT_DIR}/.last_restart"
# ── Helpers ───────────────────────────────────────────────────────────────────
# Timestamp used as the prefix of every probe message (local time).
_probe_stamp() {
  date '+%Y-%m-%d %H:%M:%S'
}

# log: informational message on stdout.
log() {
  printf '[%s] RUNNER-PROBE: %s\n' "$(_probe_stamp)" "$*"
}

# warn: warning message on stderr.
warn() {
  printf '[%s] RUNNER-PROBE WARNING: %s\n' "$(_probe_stamp)" "$*" >&2
}

# alert: P1 alert message on stderr.
alert() {
  printf '[%s] RUNNER-PROBE P1-ALERT: %s\n' "$(_probe_stamp)" "$*" >&2
}
# Resolve the API token: a value already present in GITEA_TOKEN wins; the
# token file is only consulted when the variable is empty and the file exists.
if [[ -z "$GITEA_TOKEN" ]] && [[ -f "$GITEA_TOKEN_FILE" ]]; then
  GITEA_TOKEN=$(cat "$GITEA_TOKEN_FILE")
fi

# Without a token the probe cannot query the API; stop with exit code 2.
[[ -n "$GITEA_TOKEN" ]] || {
  warn "No Gitea API token configured. Set GITEA_TOKEN env var or write to ${GITEA_TOKEN_FILE}"
  warn "Cannot query runner health without API token. Exiting."
  exit 2
}

# Alert markers and the restart-cooldown file live in this directory.
mkdir -p "$ALERT_DIR"
# ── Query Gitea runner count ───────────────────────────────────────────────────
#######################################
# Ask the Gitea admin API how many runners are currently usable.
# Globals:   GITEA_URL, GITEA_TOKEN (read)
# Outputs:   the runner count on stdout; diagnostics through warn (stderr)
# Returns:   0 when a count was produced
#            2 when the request failed, the API answered with a non-200
#              status, or the response body could not be parsed
#######################################
query_active_runners() {
  local endpoint="${GITEA_URL}/api/v1/admin/runners"
  local raw http_code body runner_count

  # curl's -f is intentionally not used: with -f an HTTP 4xx/5xx becomes a
  # generic curl failure and the status-specific warning below could never
  # report e.g. a 401/403 caused by a token without admin rights.
  # NOTE(review): the token is passed on curl's command line and is therefore
  # visible in the process list while the request runs — consider feeding the
  # header via a file/stdin if this host is shared.
  raw=$(curl -s \
    --max-time 15 \
    -H "Authorization: token ${GITEA_TOKEN}" \
    -H "Content-Type: application/json" \
    -w '\n__HTTP_STATUS__%{http_code}' \
    "${endpoint}?limit=50" 2>/dev/null) || {
    warn "Gitea API request failed (curl error). URL: ${endpoint}"
    return 2
  }

  # The -w format appends "__HTTP_STATUS__<code>" on its own final line;
  # split it from the body with parameter expansion (no GNU grep -P needed).
  http_code=${raw##*__HTTP_STATUS__}
  body=${raw%__HTTP_STATUS__*}
  body=${body%$'\n'}

  if [[ "$http_code" != "200" ]]; then
    warn "Gitea API returned HTTP ${http_code}. Check token permissions (requires admin)."
    return 2
  fi

  # Count runners whose status is "online", "idle" or "active". The payload
  # may be a bare list or an object carrying the list under "runners"/"data";
  # a null list is treated as empty.
  if ! runner_count=$(python3 -c "
import sys, json
data = json.load(sys.stdin)
if isinstance(data, dict):
    data = data.get('runners', data.get('data', []))
runners = data or []
print(sum(1 for r in runners if r.get('status') in ('online', 'idle', 'active')))
" <<<"$body" 2>/dev/null); then
    # Fallback: the per-runner status could not be read, so count every entry.
    if ! runner_count=$(python3 -c "
import sys, json
data = json.load(sys.stdin)
if isinstance(data, dict):
    data = data.get('runners', data.get('data', []))
print(len(data or []))
" <<<"$body" 2>/dev/null); then
      # Nothing countable (invalid JSON, unexpected shape, or python3 missing).
      # This is a probe failure, not evidence of zero runners, so report it
      # as an API/config error instead of triggering a P1 and a restart.
      warn "Could not parse Gitea API response — unable to count runners."
      return 2
    fi
    warn "Could not parse runner status — counting all runners: ${runner_count}"
  fi

  printf '%s\n' "${runner_count:-0}"
  return 0
}
# ── Cooldown check ────────────────────────────────────────────────────────────
#######################################
# Tell whether a restart was recorded less than COOLDOWN_SECS seconds ago.
# Globals:   COOLDOWN_FILE, COOLDOWN_SECS (read)
# Outputs:   a log line when the cooldown is active or the marker is unusable
# Returns:   0 if the cooldown is active (caller should skip the restart),
#            1 otherwise (no marker, unusable marker, or cooldown elapsed)
#######################################
in_cooldown() {
  local last_restart="" now age

  [[ -f "$COOLDOWN_FILE" ]] || return 1

  # Best-effort read of the first line; surrounding whitespace is trimmed.
  # An unreadable file simply leaves last_restart empty.
  { read -r last_restart < "$COOLDOWN_FILE"; } 2>/dev/null || true

  # An empty or corrupt marker must not reach the arithmetic below, where it
  # would be a syntax error (or an unbound-variable abort under `set -u`).
  if [[ ! "$last_restart" =~ ^[0-9]+$ ]]; then
    log "Ignoring unusable cooldown marker ${COOLDOWN_FILE}."
    return 1
  fi

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  age=$(( now - 10#$last_restart ))
  if (( age < COOLDOWN_SECS )); then
    log "Restart cooldown active (${age}s < ${COOLDOWN_SECS}s). Skipping restart attempt."
    return 0
  fi
  return 1
}
# Remember when the service was last restarted: overwrite COOLDOWN_FILE with
# the current time as seconds since the epoch.
record_restart() {
  local stamp
  stamp=$(date +%s)
  printf '%s\n' "$stamp" > "$COOLDOWN_FILE"
}
# ── Main probe logic ───────────────────────────────────────────────────────────
log "Querying Gitea runner health at ${GITEA_URL}..."

# Capture both the printed count and the function's exit status.
RUNNER_COUNT=$(query_active_runners)
QUERY_EXIT=$?

# Status 2 means the probe could not obtain a count. Leave a line in
# probe-errors.log so monitoring can see the probe itself is broken, then
# stop with exit code 2.
if (( QUERY_EXIT == 2 )); then
  warn "API unreachable — cannot assess runner health. Check network and token."
  printf '%s PROBE_ERROR: API unreachable\n' "$(date -Iseconds)" \
    >> "${ALERT_DIR}/probe-errors.log"
  exit 2
fi

log "Active runner count: ${RUNNER_COUNT}"

# Healthy: at least one runner. Remove any leftover P1 marker and finish.
if (( RUNNER_COUNT >= 1 )); then
  log "Runners OK. ${RUNNER_COUNT} active runner(s) online."
  rm -f "${ALERT_DIR}/p1-zero-runners.alert"
  exit 0
fi
# ── Zero-runner P1 alert path ─────────────────────────────────────────────────
alert "ZERO active runners detected on ${GITEA_URL}!"
alert "All CI jobs will queue silently. Attempting auto-correction."

# Marker file describing the incident; it is overwritten on every zero-runner
# run and later steps append their outcome to it.
ALERT_FILE="${ALERT_DIR}/p1-zero-runners.alert"
{
  printf '%s\n' "P1 ALERT — ZERO GITEA RUNNERS"
  printf 'Detected : %s\n' "$(date -Iseconds)"
  printf 'Host : %s\n' "$(hostname)"
  printf 'Gitea : %s\n' "${GITEA_URL}"
  printf '%s\n' "Impact : ALL CI jobs queuing silently — no runners available"
  printf 'Action : Auto-restart of %s attempted (see below)\n' "${RUNNER_SERVICE}"
} > "$ALERT_FILE"
log "P1 alert written to ${ALERT_FILE}"
# ── Auto-correction: restart act_runner ───────────────────────────────────────
# Flow: honour the restart cooldown, restart the service, wait briefly, then
# re-query Gitea to verify that a runner came back. Every outcome is appended
# to the P1 alert file; only a verified recovery exits 0, everything else
# exits 1.
if in_cooldown; then
  alert "Cannot attempt restart — cooldown active. Manual intervention may be required."
  alert "Check: systemctl status ${RUNNER_SERVICE}"
  alert "See alert file: ${ALERT_FILE}"
  exit 1
fi

log "Attempting to restart ${RUNNER_SERVICE} service..."
if ! systemctl restart "$RUNNER_SERVICE" 2>&1; then
  # No cooldown is recorded here, so the next probe run will try again.
  alert "systemctl restart ${RUNNER_SERVICE} failed — service may not exist on this host."
  alert "Verify act_runner is installed via provision-runner.sh."
  echo "AutoRestart: systemctl failed at $(date -Iseconds)" >> "$ALERT_FILE"
  exit 1
fi

record_restart
log "Service restart issued. Waiting 15s for runner to register..."
sleep 15

# Re-query to verify recovery. A failed query counts as "still zero".
RUNNER_COUNT_AFTER=$(query_active_runners 2>/dev/null || echo "0")
if (( ${RUNNER_COUNT_AFTER:-0} > 0 )); then
  log "Recovery SUCCESS: ${RUNNER_COUNT_AFTER} runner(s) online after restart."
  # Append a recovery note to the alert file (the file stays as audit trail).
  # The separator keeps the ISO timestamp from running into the runner count.
  echo "Recovered : $(date -Iseconds) — ${RUNNER_COUNT_AFTER} runner(s) online after restart" >> "$ALERT_FILE"
  exit 0
fi

alert "Recovery FAILED: still zero runners after restart."
alert "Manual intervention required."
alert "Next steps:"
alert " 1. ssh root@$(hostname) 'journalctl -u ${RUNNER_SERVICE} -n 100'"
alert " 2. Verify registration token: ${GITEA_URL}/user/settings/applications"
alert " 3. Re-run: /root/wizards/the-nexus/scripts/provision-runner.sh --token <new-token>"
echo "AutoRestart: FAILED at $(date -Iseconds)" >> "$ALERT_FILE"
exit 1