#!/usr/bin/env bash
# runner-health-probe.sh — Gitea Runner Health Probe (poka-yoke detection layer)
# Refs: #1097 (POKA-YOKE: Make unregistered runners impossible to miss)
#
# Called every 5 minutes by runner-health-probe.timer (systemd).
# Can also be run manually for immediate status.
#
# POKA-YOKE detection + correction:
#   1. Queries Gitea API for active runner count
#   2. Reports count to Timmy Time via journal/log every run
#   3. On ZERO active runners:
#      a. Logs P1 alert to journal
#      b. Creates alert marker file for external watchers
#      c. Attempts to restart act_runner service (auto-correction)
#      d. Re-queries after restart to verify recovery
#
# Exit codes:
#   0 — runners healthy (≥1 online runner), including after a successful
#       auto-restart
#   1 — zero runners detected (P1 alert fired)
#   2 — Gitea API unreachable or no API token (network/config error)

# errexit (-e) is intentionally not enabled: the probe inspects exit statuses
# itself (for example the status of the runner query) and must keep running
# after a command fails.
set -uo pipefail

# ── Configuration ─────────────────────────────────────────────────────────────
# Every setting below can be overridden through the environment.
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
GITEA_TOKEN="${GITEA_TOKEN:-}"
GITEA_TOKEN_FILE="${GITEA_TOKEN_FILE:-/etc/act_runner/gitea-probe-token}"
ALERT_DIR="${ALERT_DIR:-/var/lib/act_runner/alerts}"
RUNNER_SERVICE="${RUNNER_SERVICE:-act_runner}"
# Restart cooldown: by default, don't restart more than once per 10 minutes.
# COOLDOWN_FILE holds the epoch timestamp of the last restart that was issued.
COOLDOWN_FILE="${ALERT_DIR}/.last_restart"
COOLDOWN_SECS="${COOLDOWN_SECS:-600}"

# ── Helpers ───────────────────────────────────────────────────────────────────
# Timestamped message emitters. All arguments are joined into one message.
#   log   → stdout, routine progress
#   warn  → stderr, degraded but non-fatal conditions
#   alert → stderr, P1 conditions
_probe_stamp() {
  date '+%Y-%m-%d %H:%M:%S'
}

log() {
  printf '[%s] RUNNER-PROBE: %s\n' "$(_probe_stamp)" "$*"
}

warn() {
  printf '[%s] RUNNER-PROBE WARNING: %s\n' "$(_probe_stamp)" "$*" >&2
}

alert() {
  printf '[%s] RUNNER-PROBE P1-ALERT: %s\n' "$(_probe_stamp)" "$*" >&2
}

# Load token from file if not set via env
if [[ -z "$GITEA_TOKEN" && -f "$GITEA_TOKEN_FILE" ]]; then
  if [[ -r "$GITEA_TOKEN_FILE" ]]; then
    GITEA_TOKEN=$(<"$GITEA_TOKEN_FILE")
  else
    warn "Token file ${GITEA_TOKEN_FILE} exists but is not readable."
  fi
fi

# Drop any whitespace (e.g. a carriage return from a CRLF-terminated token
# file); a token containing it would be sent verbatim and rejected.
GITEA_TOKEN="${GITEA_TOKEN//[[:space:]]/}"

if [[ -z "$GITEA_TOKEN" ]]; then
  warn "No Gitea API token configured. Set GITEA_TOKEN env var or write to ${GITEA_TOKEN_FILE}"
  warn "Cannot query runner health without API token. Exiting."
  exit 2
fi

# The probe stays best-effort when the alert directory cannot be created:
# journal output still works, only the marker files are lost.
if ! mkdir -p "$ALERT_DIR"; then
  warn "Cannot create alert directory ${ALERT_DIR}; alert marker files will not be written."
fi

# ── Query Gitea runner count ───────────────────────────────────────────────────
#######################################
# Print how many runners the Gitea admin API lists with a status of
# "online", "idle" or "active".
# Globals:   GITEA_URL, GITEA_TOKEN (read)
# Arguments: none
# Outputs:   the count on stdout; diagnostics on stderr via warn
# Returns:   0 when a count was printed
#            2 when the request failed, the HTTP status was not 200, or the
#              response body could not be understood as a runner list
#######################################
query_active_runners() {
  local endpoint raw http_code body runner_count parse_rc
  local -r marker='__HTTP_STATUS__'

  endpoint="${GITEA_URL}/api/v1/admin/runners"

  # The Authorization header is passed as curl config on stdin so the token
  # never appears in the process argument list. (Assumes the token contains
  # no double quote or backslash.)
  # --fail is deliberately not used: with it curl exits non-zero on 4xx/5xx
  # and the HTTP status check below could never report the status code.
  raw=$(curl -s \
    --max-time 15 \
    --config - \
    -H "Content-Type: application/json" \
    -w "\n${marker}%{http_code}" \
    "${endpoint}?limit=50" 2>/dev/null \
    <<<"header = \"Authorization: token ${GITEA_TOKEN}\"") || {
    warn "Gitea API request failed (curl error). URL: ${endpoint}"
    return 2
  }

  # Split "<body>\n__HTTP_STATUS__<code>" with parameter expansion.
  http_code=${raw##*"${marker}"}
  body=${raw%$'\n'"${marker}"*}
  [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code='unknown'

  if [[ "$http_code" != "200" ]]; then
    warn "Gitea API returned HTTP ${http_code}. Check token permissions (requires admin)."
    return 2
  fi

  # Accepts a bare JSON list, or an object carrying the list under "runners"
  # or "data"; a null list counts as empty.
  # Exit status of the snippet: 0 = counted by status, 4 = status fields
  # unusable so every entry was counted, anything else = unparseable body.
  runner_count=$(python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin)
except ValueError:
    sys.exit(3)
if isinstance(data, dict):
    runners = data.get("runners", data.get("data", []))
else:
    runners = data
if runners is None:
    runners = []
if not isinstance(runners, (list, dict)):
    sys.exit(3)
try:
    print(sum(1 for r in runners if r.get("status") in ("online", "idle", "active")))
except AttributeError:
    print(len(runners))
    sys.exit(4)
' <<<"$body" 2>/dev/null)
  parse_rc=$?

  case "$parse_rc" in
    0) ;;
    4)
      warn "Could not parse runner status — counting all runners: ${runner_count}"
      ;;
    *)
      # An unreadable answer is a probe failure, not proof of zero runners;
      # reporting 0 here would raise a false P1 alert and restart the service.
      warn "Gitea API response could not be parsed as a runner list."
      return 2
      ;;
  esac

  printf '%s\n' "${runner_count:-0}"
  return 0
}

# ── Cooldown check ────────────────────────────────────────────────────────────
#######################################
# Report whether a restart was recorded less than COOLDOWN_SECS seconds ago.
# Globals:   COOLDOWN_FILE, COOLDOWN_SECS (read)
# Arguments: none
# Outputs:   a log line on stdout while the cooldown is active; a warning on
#            stderr when the recorded timestamp is unusable
# Returns:   0 if the cooldown is active (caller must not restart), 1 otherwise
#######################################
in_cooldown() {
  local last_restart='' now age

  [[ -f "$COOLDOWN_FILE" ]] || return 1

  # Only the first line matters. A failed read (unreadable file, or no
  # trailing newline) simply leaves whatever was read so far.
  { IFS= read -r last_restart < "$COOLDOWN_FILE"; } 2>/dev/null || true

  # An empty or corrupt file must not break the arithmetic below (syntax
  # error, or an unbound-variable abort under `set -u`). Treat it as
  # "no restart recorded"; the next successful restart rewrites the file.
  if [[ ! "$last_restart" =~ ^[0-9]+$ ]]; then
    warn "Ignoring unusable restart timestamp in ${COOLDOWN_FILE}."
    return 1
  fi

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  age=$(( now - 10#$last_restart ))

  # A negative age means the stamp lies in the future (clock stepped back);
  # that must not keep the cooldown active indefinitely.
  if (( age >= 0 && age < COOLDOWN_SECS )); then
    log "Restart cooldown active (${age}s < ${COOLDOWN_SECS}s). Skipping restart attempt."
    return 0
  fi

  return 1
}

#######################################
# Remember when a restart was issued so in_cooldown can rate-limit the next.
# Globals:   COOLDOWN_FILE (written; replaced with the current epoch seconds)
# Arguments: none
#######################################
record_restart() {
  local epoch
  epoch=$(date +%s)
  printf '%s\n' "$epoch" > "$COOLDOWN_FILE"
}

# ── Main probe logic ───────────────────────────────────────────────────────────
log "Querying Gitea runner health at ${GITEA_URL}..."

# A plain assignment from $(...) leaves the function's exit status in $?.
RUNNER_COUNT=$(query_active_runners)
QUERY_EXIT=$?

case "$QUERY_EXIT" in
  2)
    warn "API unreachable — cannot assess runner health. Check network and token."
    # Leave a trace on disk so monitoring can tell that the probe itself is
    # failing, as opposed to the runners being down.
    printf '%s PROBE_ERROR: API unreachable\n' "$(date -Iseconds)" \
      >> "${ALERT_DIR}/probe-errors.log"
    exit 2
    ;;
esac

log "Active runner count: ${RUNNER_COUNT}"

# ── Healthy path ──────────────────────────────────────────────────────────────
# At least one runner is up: report it, drop any P1 marker left by an earlier
# run, and finish successfully.
if (( RUNNER_COUNT >= 1 )); then
  log "Runners OK. ${RUNNER_COUNT} active runner(s) online."
  rm -f "${ALERT_DIR}/p1-zero-runners.alert"
  exit 0
fi

# ── Zero-runner P1 alert path ─────────────────────────────────────────────────
alert "ZERO active runners detected on ${GITEA_URL}!"
alert "All CI jobs will queue silently. Attempting auto-correction."

# (Re)write the P1 marker file for external monitoring. The file is replaced
# on every run that reaches this point; later steps append their outcome.
ALERT_FILE="${ALERT_DIR}/p1-zero-runners.alert"
{
  printf '%s\n' "P1 ALERT — ZERO GITEA RUNNERS"
  printf 'Detected : %s\n' "$(date -Iseconds)"
  printf 'Host : %s\n' "$(hostname)"
  printf 'Gitea : %s\n' "${GITEA_URL}"
  printf '%s\n' "Impact : ALL CI jobs queuing silently — no runners available"
  printf 'Action : Auto-restart of %s attempted (see below)\n' "${RUNNER_SERVICE}"
} > "$ALERT_FILE"

log "P1 alert written to ${ALERT_FILE}"

# ── Auto-correction: restart act_runner ───────────────────────────────────────
# Guard 1: respect the restart cooldown.
if in_cooldown; then
  alert "Cannot attempt restart — cooldown active. Manual intervention may be required."
  alert "Check: systemctl status ${RUNNER_SERVICE}"
  alert "See alert file: ${ALERT_FILE}"
  exit 1
fi

log "Attempting to restart ${RUNNER_SERVICE} service..."

# Guard 2: the restart command itself must succeed. A failed restart is not
# recorded, so it does not start a cooldown.
if ! systemctl restart "$RUNNER_SERVICE" 2>&1; then
  alert "systemctl restart ${RUNNER_SERVICE} failed — service may not exist on this host."
  alert "Verify act_runner is installed via provision-runner.sh."
  echo "AutoRestart: systemctl failed at $(date -Iseconds)" >> "$ALERT_FILE"
  exit 1
fi

record_restart
log "Service restart issued. Waiting 15s for runner to register..."
sleep 15

# Re-query to verify recovery. A failed query is counted as zero runners.
RUNNER_COUNT_AFTER=$(query_active_runners 2>/dev/null || echo "0")

if (( RUNNER_COUNT_AFTER > 0 )); then
  log "Recovery SUCCESS: ${RUNNER_COUNT_AFTER} runner(s) online after restart."
  # Append the recovery note to the alert file (leave file as audit trail).
  echo "Recovered : $(date -Iseconds) — ${RUNNER_COUNT_AFTER} runner(s) online after restart" >> "$ALERT_FILE"
  exit 0
fi

alert "Recovery FAILED: still zero runners after restart."
alert "Manual intervention required."
alert "Next steps:"
alert " 1. ssh root@$(hostname) 'journalctl -u ${RUNNER_SERVICE} -n 100'"
alert " 2. Verify registration token: ${GITEA_URL}/user/settings/applications"
alert " 3. Re-run: /root/wizards/the-nexus/scripts/provision-runner.sh --token <new-token>"
echo "AutoRestart: FAILED at $(date -Iseconds)" >> "$ALERT_FILE"
exit 1