[claude] Poka-yoke runner health: provision + health probe scripts (#1097) #1101

Merged
claude merged 1 commits from claude/issue-1097 into main 2026-04-07 14:33:36 +00:00
4 changed files with 449 additions and 0 deletions

229
scripts/provision-runner.sh Normal file
View File

@@ -0,0 +1,229 @@
#!/usr/bin/env bash
# provision-runner.sh — VPS provisioning script for Gitea act_runner
# Refs: #1097 (POKA-YOKE: Make unregistered runners impossible to miss)
#
# Usage (on Bezalel VPS as root):
# bash provision-runner.sh --gitea-url <url> --token <runner-registration-token>
#
# This script:
# 1. Downloads and installs act_runner binary
# 2. Registers the runner with the Gitea instance
# 3. Creates and enables systemd service for act_runner
# 4. Installs the runner-health-probe timer (poka-yoke detection layer)
#
# POKA-YOKE principles applied:
# Prevention: runner registration is mandatory — script exits non-zero if registration fails
# Detection: runner-health-probe.sh installed and enabled as part of this script
# Correction: health probe auto-restarts act_runner on zero-runner detection
# Strict mode: stop on the first failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail
# ── Configuration defaults (override via env or flags) ───────────────────────
# Each setting keeps a non-empty value already present in the environment and
# otherwise falls back to the default written here.
# Base URL of the Gitea instance the runner registers with (--gitea-url).
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
# Runner registration token (--token); empty by default and rejected after
# argument parsing if still empty.
RUNNER_TOKEN="${RUNNER_TOKEN:-}"
# Runner name passed to `act_runner register` (--name); "<hostname>-runner" by default.
RUNNER_NAME="${RUNNER_NAME:-$(hostname)-runner}"
# Comma-separated labels passed to `act_runner register` (--labels).
RUNNER_LABELS="${RUNNER_LABELS:-ubuntu-latest,linux,x86_64}"
# act_runner release to download; also compared against an installed binary (--version).
ACT_RUNNER_VERSION="${ACT_RUNNER_VERSION:-0.2.10}"
# Directory that receives the act_runner binary.
INSTALL_DIR="${INSTALL_DIR:-/usr/local/bin}"
# Directory holding config.yaml.
CONFIG_DIR="${CONFIG_DIR:-/etc/act_runner}"
# Working directory of the act_runner systemd service.
DATA_DIR="${DATA_DIR:-/var/lib/act_runner}"
# Checkout of the-nexus that contains the health-probe script (--nexus-dir).
NEXUS_DIR="${NEXUS_DIR:-/root/wizards/the-nexus}"
# Derived from NEXUS_DIR; recomputed by the argument parser when --nexus-dir is given.
PROBE_SCRIPT="${NEXUS_DIR}/scripts/runner-health-probe.sh"
# ── Helpers ───────────────────────────────────────────────────────────────────
# Write a timestamped provisioning progress line to stdout.
log() {
  printf '[%s] PROVISION: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Write a timestamped error line to stderr, then end the script with status 1.
fail() {
  printf '[%s] PROVISION ERROR: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2
  exit 1
}
# Print command-line help to stdout.
# The heredoc delimiter is unquoted on purpose so the current defaults
# ($GITEA_URL, $RUNNER_LABELS, $ACT_RUNNER_VERSION, $NEXUS_DIR) are shown.
# The environment-variable list covers every setting the configuration section
# reads, and checklist item [2] describes only what the script enforces.
usage() {
cat <<EOF
Usage: provision-runner.sh [options]
Options:
--gitea-url <url> Gitea base URL (default: $GITEA_URL)
--token <token> Runner registration token (required)
--name <name> Runner name (default: hostname-runner)
--labels <labels> Comma-separated labels (default: $RUNNER_LABELS)
--version <ver> act_runner version to install (default: $ACT_RUNNER_VERSION)
--nexus-dir <path> Path to the-nexus checkout (default: $NEXUS_DIR)
--help Show this help
Environment variables: GITEA_URL, RUNNER_TOKEN, RUNNER_NAME, RUNNER_LABELS,
ACT_RUNNER_VERSION, INSTALL_DIR, CONFIG_DIR, DATA_DIR, NEXUS_DIR
POKA-YOKE CHECKLIST (enforced by this script):
[1] act_runner binary installed and executable
[2] Runner registered with Gitea (registration command must succeed)
[3] act_runner systemd service enabled and running
[4] runner-health-probe timer installed and enabled
EOF
}
# ── Argument parsing ──────────────────────────────────────────────────────────
# Parse command-line flags. Every value-taking flag overrides the matching
# environment default; --help prints usage and exits 0; anything else is fatal.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --help)
      usage
      exit 0
      ;;
    --gitea-url | --token | --name | --labels | --version | --nexus-dir)
      # Under `set -u`, reading "$2" for a flag given without a value would
      # abort with an opaque "unbound variable" error; report it properly.
      [[ $# -ge 2 ]] || fail "Option $1 requires a value. Use --help for usage."
      case "$1" in
        --gitea-url) GITEA_URL="$2" ;;
        --token) RUNNER_TOKEN="$2" ;;
        --name) RUNNER_NAME="$2" ;;
        --labels) RUNNER_LABELS="$2" ;;
        --version) ACT_RUNNER_VERSION="$2" ;;
        --nexus-dir)
          NEXUS_DIR="$2"
          # Keep the derived probe path in step with the new checkout location.
          PROBE_SCRIPT="${NEXUS_DIR}/scripts/runner-health-probe.sh"
          ;;
      esac
      shift 2
      ;;
    *)
      fail "Unknown argument: $1. Use --help for usage."
      ;;
  esac
done
# Prevention layer: there is nothing useful to do without a registration token.
if [[ -z "$RUNNER_TOKEN" ]]; then
  fail "Runner registration token required. Pass --token or set RUNNER_TOKEN env var."
fi
# ── Step 1: Install act_runner binary ─────────────────────────────────────────
log "Step 1/4: Installing act_runner v${ACT_RUNNER_VERSION}..."
# Map the kernel architecture name onto the suffix used by release artifacts.
ARCH=$(uname -m)
case "$ARCH" in
  x86_64) ARCH_SUFFIX="amd64" ;;
  aarch64) ARCH_SUFFIX="arm64" ;;
  *) fail "Unsupported architecture: $ARCH" ;;
esac
BINARY_URL="https://gitea.com/gitea/act_runner/releases/download/v${ACT_RUNNER_VERSION}/act_runner-${ACT_RUNNER_VERSION}-linux-${ARCH_SUFFIX}"
BINARY_PATH="${INSTALL_DIR}/act_runner"

# Download the release binary into a temporary file inside INSTALL_DIR and
# rename it into place. Writing directly onto BINARY_PATH fails with
# "Text file busy" while the service is executing the old binary, and an
# interrupted download would leave a truncated executable behind.
download_act_runner() {
  local staged
  mkdir -p "$INSTALL_DIR"
  staged=$(mktemp "${INSTALL_DIR}/.act_runner.XXXXXX") \
    || fail "Cannot create a temporary file in ${INSTALL_DIR}."
  if ! curl -fsSL "$BINARY_URL" -o "$staged"; then
    rm -f -- "$staged"
    fail "Download failed: ${BINARY_URL}"
  fi
  if ! chmod 0755 "$staged" || ! mv -f -- "$staged" "$BINARY_PATH"; then
    rm -f -- "$staged"
    fail "Could not install act_runner to ${BINARY_PATH}."
  fi
}

if [[ -f "$BINARY_PATH" ]]; then
  # Take the first x.y.z in the version banner; anything unparseable counts as
  # "unknown" and triggers a reinstall.
  CURRENT_VER=$("$BINARY_PATH" --version 2>/dev/null \
    | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' \
    | head -n 1) || true
  CURRENT_VER=${CURRENT_VER:-unknown}
  if [[ "$CURRENT_VER" == "$ACT_RUNNER_VERSION" ]]; then
    log "act_runner v${ACT_RUNNER_VERSION} already installed — skipping download."
  else
    log "Upgrading act_runner from v${CURRENT_VER} to v${ACT_RUNNER_VERSION}..."
    download_act_runner
  fi
else
  download_act_runner
fi
# Checklist item [1]: the binary must actually execute, not merely exist.
"$BINARY_PATH" --version >/dev/null 2>&1 || fail "act_runner binary not functional after install."
log "act_runner binary OK: $("$BINARY_PATH" --version 2>/dev/null || echo 'installed')"
# ── Step 2: Register runner with Gitea ────────────────────────────────────────
log "Step 2/4: Registering runner with Gitea at ${GITEA_URL}..."
mkdir -p "$CONFIG_DIR" "$DATA_DIR"
CONFIG_FILE="${CONFIG_DIR}/config.yaml"

# Nothing else in this script creates the config file, and `--config` needs a
# readable file, so generate the default one on first provisioning.
# NOTE(review): per the act_runner docs, `register` itself writes its state to
# a `.runner` file relative to the working directory, not to config.yaml.
if [[ ! -s "$CONFIG_FILE" ]]; then
  if ! "$BINARY_PATH" generate-config > "$CONFIG_FILE"; then
    rm -f -- "$CONFIG_FILE"
    fail "Could not generate default act_runner config at ${CONFIG_FILE}."
  fi
fi

# Unpredictable log name: this runs as root, so a fixed path under /tmp could
# be pre-created (or symlinked) by another local user.
REGISTER_LOG=$(mktemp "${TMPDIR:-/tmp}/act_runner_register.XXXXXX") \
  || fail "Could not create a temporary registration log."

# Register from DATA_DIR, the same directory the systemd service uses as its
# WorkingDirectory, so the daemon finds the registration state it needs.
# The exit status of `register` is the success signal; the previous check also
# accepted any pre-existing config file, which a failed re-run would satisfy.
register_status=0
(
  cd "$DATA_DIR" && exec "$BINARY_PATH" register \
    --no-interactive \
    --instance "$GITEA_URL" \
    --token "$RUNNER_TOKEN" \
    --name "$RUNNER_NAME" \
    --labels "$RUNNER_LABELS" \
    --config "$CONFIG_FILE"
) 2>&1 | tee "$REGISTER_LOG" || register_status=$?

if (( register_status != 0 )); then
  fail "Runner registration failed. Check token and Gitea URL. Log: ${REGISTER_LOG}"
fi
log "Runner registered. Config written to ${CONFIG_FILE}"
# ── Step 3: Create and enable systemd service ─────────────────────────────────
log "Step 3/4: Installing act_runner systemd service..."
# The unit is rendered with the paths chosen above (unquoted heredoc delimiter).
# It is ordered after network-online.target, the target it also pulls in with
# Wants=; ordering only after network.target does not wait for the network to
# be configured, which the runner needs to reach Gitea.
cat > /etc/systemd/system/act_runner.service <<EOF
[Unit]
Description=Gitea Actions Runner (act_runner)
Documentation=https://gitea.com/gitea/act_runner
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=root
WorkingDirectory=${DATA_DIR}
ExecStart=${INSTALL_DIR}/act_runner daemon --config ${CONFIG_FILE}
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
Environment=HOME=/root
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
systemctl enable act_runner
# restart (not start) so a re-run of this script picks up a new binary/config.
systemctl restart act_runner
# Give the daemon a moment to crash if it is going to; a unit that is waiting
# for an automatic restart does not report as active.
sleep 3
if ! systemctl is-active --quiet act_runner; then
  fail "act_runner service failed to start. Check: journalctl -u act_runner -n 50"
fi
log "act_runner service running."
# ── Step 4: Install runner health probe ───────────────────────────────────────
log "Step 4/4: Installing runner-health-probe systemd timer..."
if [[ ! -f "$PROBE_SCRIPT" ]]; then
  # Soft failure: provisioning continues and the checklist reports item [4].
  log "WARNING: probe script not found at ${PROBE_SCRIPT}. Skipping timer install."
  log " Re-run after the-nexus is checked out to: ${NEXUS_DIR}"
  log " Then manually: systemctl enable --now runner-health-probe.timer"
else
  chmod +x "$PROBE_SCRIPT"
  # Install service unit (started by the timer below; it has no [Install] section).
  cat > /etc/systemd/system/runner-health-probe.service <<EOF
[Unit]
Description=Gitea Runner Health Probe (poka-yoke zero-runner detection)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
After=network.target act_runner.service
[Service]
Type=oneshot
ExecStart=${PROBE_SCRIPT}
StandardOutput=journal
StandardError=journal
Environment=HOME=/root
EOF
  # Install timer unit (every 5 minutes)
  cat > /etc/systemd/system/runner-health-probe.timer <<EOF
[Unit]
Description=Gitea Runner Health Probe — every 5 minutes (poka-yoke #1097)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
[Timer]
OnBootSec=2min
OnUnitActiveSec=5min
Persistent=true
[Install]
WantedBy=timers.target
EOF
  systemctl daemon-reload
  systemctl enable --now runner-health-probe.timer
  log "runner-health-probe.timer enabled (fires every 5 minutes)."
  # The service unit above passes no GITEA_TOKEN, so the probe can only get its
  # API token from its default token file. Without it the probe exits 2 on
  # every run and the detection layer is silently ineffective — say so now.
  if [[ ! -s /etc/act_runner/gitea-probe-token ]]; then
    log "WARNING: no probe API token at /etc/act_runner/gitea-probe-token."
    log " runner-health-probe.sh exits 2 on every run until a Gitea admin API token is written there."
  fi
fi
# ── Poka-yoke checklist summary ───────────────────────────────────────────────
# Final report: re-evaluate each checklist item and print one status line each.
divider="══════════════════════════════════════════════════════════"
printf '\n%s\n' "$divider"
printf '%s\n' " POKA-YOKE PROVISIONING CHECKLIST — $(hostname)"
printf '%s\n' "$divider"

# [1] the installed binary still executes
printf '%s' " [1] act_runner binary : "
if "$BINARY_PATH" --version >/dev/null 2>&1; then
  echo "OK"
else
  echo "FAIL"
fi

# [2] the config file is present
printf '%s' " [2] act_runner registered : "
if [[ -f "$CONFIG_FILE" ]]; then
  echo "OK (config exists)"
else
  echo "FAIL (no config)"
fi

# [3] the runner service is active
printf '%s' " [3] act_runner service : "
if systemctl is-active --quiet act_runner; then
  echo "RUNNING"
else
  echo "FAIL"
fi

# [4] the probe timer is active (absent when step 4 was skipped)
printf '%s' " [4] health-probe timer : "
if systemctl is-active --quiet runner-health-probe.timer 2>/dev/null; then
  echo "ACTIVE"
else
  echo "NOT INSTALLED (re-run after nexus checkout)"
fi

printf '%s\n\n' "$divider"
log "Provisioning complete. Runner '${RUNNER_NAME}' registered at ${GITEA_URL}"

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env bash
# runner-health-probe.sh — Gitea Runner Health Probe (poka-yoke detection layer)
# Refs: #1097 (POKA-YOKE: Make unregistered runners impossible to miss)
#
# Called every 5 minutes by runner-health-probe.timer (systemd).
# Can also be run manually for immediate status.
#
# POKA-YOKE detection + correction:
# 1. Queries Gitea API for active runner count
# 2. Reports count to Timmy Time via journal/log every run
# 3. On ZERO active runners:
# a. Logs P1 alert to journal
# b. Creates alert marker file for external watchers
# c. Attempts to restart act_runner service (auto-correction)
# d. Re-queries after restart to verify recovery
#
# Exit codes:
# 0 — runners healthy (≥1 online runner)
# 1 — zero runners detected (P1 alert fired)
# 2 — Gitea API unreachable (network/config error)
# No `-e` here: the probe inspects exit statuses itself and maps them onto its
# documented exit codes.
set -uo pipefail
# ── Configuration ─────────────────────────────────────────────────────────────
# Every setting can be overridden from the environment.
# Base URL of the Gitea instance whose runners are checked.
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
# API token; when empty it is loaded from GITEA_TOKEN_FILE further down.
GITEA_TOKEN="${GITEA_TOKEN:-}"
GITEA_TOKEN_FILE="${GITEA_TOKEN_FILE:-/etc/act_runner/gitea-probe-token}"
# Directory for alert marker files and the restart cooldown stamp.
ALERT_DIR="${ALERT_DIR:-/var/lib/act_runner/alerts}"
# systemd unit restarted when zero runners are detected.
RUNNER_SERVICE="${RUNNER_SERVICE:-act_runner}"
# Restart cooldown: don't restart more than once per COOLDOWN_SECS (default 10 minutes)
COOLDOWN_FILE="${ALERT_DIR}/.last_restart"
COOLDOWN_SECS="${COOLDOWN_SECS:-600}"
# The value is used in arithmetic comparisons; fall back to the default rather
# than fail later on a non-numeric override.
if ! [[ "$COOLDOWN_SECS" =~ ^[0-9]+$ ]]; then
  COOLDOWN_SECS=600
fi
# ── Helpers ───────────────────────────────────────────────────────────────────
# Write a timestamped informational probe line to stdout.
log() {
  printf '[%s] RUNNER-PROBE: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Write a timestamped warning line to stderr.
warn() {
  printf '[%s] RUNNER-PROBE WARNING: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2
}
# Write a timestamped P1 alert line to stderr.
alert() {
  printf '[%s] RUNNER-PROBE P1-ALERT: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2
}
# Load token from file if not set via env
# Load token from file if not set via env
if [[ -z "$GITEA_TOKEN" && -f "$GITEA_TOKEN_FILE" ]]; then
  if [[ -r "$GITEA_TOKEN_FILE" ]]; then
    GITEA_TOKEN=$(cat "$GITEA_TOKEN_FILE")
  else
    warn "Token file ${GITEA_TOKEN_FILE} exists but is not readable."
  fi
fi
# Command substitution only strips trailing newlines. A stray space, tab or CR
# (e.g. a file saved with CRLF) would corrupt the Authorization header and show
# up as an unexplained API failure, so strip all whitespace from the token.
GITEA_TOKEN="${GITEA_TOKEN//[[:space:]]/}"
if [[ -z "$GITEA_TOKEN" ]]; then
  warn "No Gitea API token configured. Set GITEA_TOKEN env var or write to ${GITEA_TOKEN_FILE}"
  warn "Cannot query runner health without API token. Exiting."
  exit 2
fi
# Alert markers and the restart cooldown stamp live here. Keep probing even if
# the directory cannot be created, but do not fail silently.
if ! mkdir -p "$ALERT_DIR"; then
  warn "Cannot create alert directory ${ALERT_DIR}; alert markers and restart cooldown will not be recorded."
fi
# ── Query Gitea runner count ───────────────────────────────────────────────────
#######################################
# Ask the Gitea admin API for its runners and count the ones that are usable.
# Globals:   GITEA_URL, GITEA_TOKEN (read)
# Outputs:   the count on stdout (success only); diagnostics via warn on stderr
# Returns:   0 when a count was determined,
#            2 when the request failed, the HTTP status was not 200, or the
#              response could not be interpreted as a runner list
# A response that cannot be interpreted is reported as an error, never as
# "0 runners": a false zero would raise a P1 alert and restart the service.
#######################################
query_active_runners() {
  local endpoint="${GITEA_URL}/api/v1/admin/runners"
  local raw body http_code count parse_rc

  # No --fail here: the HTTP status is appended via -w and inspected below, so
  # a 401/403/404 is reported with its real status instead of a generic error.
  raw=$(curl -s \
    --max-time 15 \
    -H "Authorization: token ${GITEA_TOKEN}" \
    -H "Content-Type: application/json" \
    -w "\n__HTTP_STATUS__%{http_code}" \
    "${endpoint}?limit=50" 2>/dev/null) || {
    warn "Gitea API request failed (curl error). URL: ${endpoint}"
    return 2
  }

  # Split "<body>\n__HTTP_STATUS__<code>" at the last marker.
  http_code=${raw##*__HTTP_STATUS__}
  body=${raw%__HTTP_STATUS__*}
  if [[ "$http_code" != "200" ]]; then
    warn "Gitea API returned HTTP ${http_code}. Check token permissions (requires admin)."
    return 2
  fi

  # Count runners whose status is "online", "idle" or "active".
  # Python exit codes: 0 = status-based count, 4 = entries carry no readable
  # status so all entries were counted, anything else = not a runner list.
  count=$(python3 -c '
import json
import sys

USABLE = ("online", "idle", "active")
try:
    data = json.load(sys.stdin)
except ValueError:
    sys.exit(3)
if isinstance(data, dict):
    data = data.get("runners", data.get("data", []))
if not isinstance(data, list):
    sys.exit(3)
try:
    print(sum(1 for r in data if r.get("status") in USABLE))
except AttributeError:
    print(len(data))
    sys.exit(4)
' <<<"$body" 2>/dev/null)
  parse_rc=$?

  case "$parse_rc" in
    0) ;;
    4) warn "Could not parse runner status — counting all runners: ${count}" ;;
    *)
      warn "Gitea API response could not be parsed as a runner list (python3 exit ${parse_rc})."
      return 2
      ;;
  esac
  if ! [[ "$count" =~ ^[0-9]+$ ]]; then
    warn "Gitea API response produced a non-numeric runner count."
    return 2
  fi
  printf '%s\n' "$count"
  return 0
}
# ── Cooldown check ────────────────────────────────────────────────────────────
#######################################
# Tell whether the last automatic restart is too recent to try another one.
# Globals:   COOLDOWN_FILE, COOLDOWN_SECS (read)
# Outputs:   a log line when the cooldown is active
# Returns:   0 when a restart happened less than COOLDOWN_SECS seconds ago,
#            1 otherwise (including a missing, unreadable, empty or corrupt
#              stamp file, and a stamp that lies in the future)
#######################################
in_cooldown() {
  local last_restart now age

  [[ -f "$COOLDOWN_FILE" ]] || return 1
  last_restart=$(cat "$COOLDOWN_FILE" 2>/dev/null) || return 1
  # An empty or garbled stamp must not break the arithmetic below (or trip
  # `set -u`); treat it as "no restart recorded".
  [[ "$last_restart" =~ ^[0-9]+$ ]] || return 1

  now=$(date +%s)
  # 10# forces base 10 so a stamp with leading zeros is not read as octal.
  age=$(( now - 10#$last_restart ))
  # A negative age means the clock moved backwards; do not let that block
  # restarts until the clock catches up.
  if (( age >= 0 && age < COOLDOWN_SECS )); then
    log "Restart cooldown active (${age}s < ${COOLDOWN_SECS}s). Skipping restart attempt."
    return 0
  fi
  return 1
}
# Stamp the cooldown file with the current Unix time (read back by in_cooldown).
record_restart() {
  local stamp
  stamp=$(date +%s)
  printf '%s\n' "$stamp" > "$COOLDOWN_FILE"
}
# ── Main probe logic ───────────────────────────────────────────────────────────
log "Querying Gitea runner health at ${GITEA_URL}..."
# The count arrives on stdout; the function's status says whether it is usable.
RUNNER_COUNT=$(query_active_runners)
QUERY_EXIT=$?
case "$QUERY_EXIT" in
  2)
    warn "API unreachable — cannot assess runner health. Check network and token."
    # Write an "unknown" alert marker so monitoring can see the probe itself is broken
    printf '%s\n' "$(date -Iseconds) PROBE_ERROR: API unreachable" >> "${ALERT_DIR}/probe-errors.log"
    exit 2
    ;;
esac
log "Active runner count: ${RUNNER_COUNT}"
# ── Healthy path ──────────────────────────────────────────────────────────────
# At least one runner: report, drop any stale P1 marker, and finish with 0.
if (( RUNNER_COUNT > 0 )); then
  log "Runners OK. ${RUNNER_COUNT} active runner(s) online."
  # Clear any stale P1 alert marker
  rm -f "${ALERT_DIR}/p1-zero-runners.alert"
  exit 0
fi
# ── Zero-runner P1 alert path ─────────────────────────────────────────────────
alert "ZERO active runners detected on ${GITEA_URL}!"
alert "All CI jobs will queue silently. Attempting auto-correction."
# Write P1 alert marker (watched by external monitoring, logs, etc.)
# The file is rewritten from scratch on every zero-runner run.
ALERT_FILE="${ALERT_DIR}/p1-zero-runners.alert"
{
  printf '%s\n' "P1 ALERT — ZERO GITEA RUNNERS"
  printf '%s\n' "Detected : $(date -Iseconds)"
  printf '%s\n' "Host : $(hostname)"
  printf '%s\n' "Gitea : ${GITEA_URL}"
  printf '%s\n' "Impact : ALL CI jobs queuing silently — no runners available"
  printf '%s\n' "Action : Auto-restart of ${RUNNER_SERVICE} attempted (see below)"
} > "$ALERT_FILE"
log "P1 alert written to ${ALERT_FILE}"
# ── Auto-correction: restart act_runner ───────────────────────────────────────
# Correction layer: restart the runner service at most once per cooldown
# window, then re-query to see whether a runner came back.
if in_cooldown; then
  alert "Cannot attempt restart — cooldown active. Manual intervention may be required."
  alert "Check: systemctl status ${RUNNER_SERVICE}"
  alert "See alert file: ${ALERT_FILE}"
  exit 1
fi
log "Attempting to restart ${RUNNER_SERVICE} service..."
if systemctl restart "$RUNNER_SERVICE" 2>&1; then
  # The cooldown is stamped only when the restart command itself succeeded.
  record_restart
  log "Service restart issued. Waiting 15s for runner to register..."
  sleep 15
  # Re-query to verify recovery; a failed or unusable answer counts as zero.
  RUNNER_COUNT_AFTER=$(query_active_runners 2>/dev/null || echo "0")
  if ! [[ "$RUNNER_COUNT_AFTER" =~ ^[0-9]+$ ]]; then
    RUNNER_COUNT_AFTER=0
  fi
  if (( RUNNER_COUNT_AFTER > 0 )); then
    log "Recovery SUCCESS: ${RUNNER_COUNT_AFTER} runner(s) online after restart."
    # Append recovery note to alert file (leave file as audit trail).
    # Timestamp and count are separated; they used to be fused into one token.
    echo "Recovered : $(date -Iseconds) — ${RUNNER_COUNT_AFTER} runner(s) online after restart" >> "$ALERT_FILE"
    exit 0
  else
    alert "Recovery FAILED: still zero runners after restart."
    alert "Manual intervention required."
    alert "Next steps:"
    alert " 1. ssh root@$(hostname) 'journalctl -u ${RUNNER_SERVICE} -n 100'"
    alert " 2. Verify registration token: ${GITEA_URL}/user/settings/applications"
    alert " 3. Re-run: /root/wizards/the-nexus/scripts/provision-runner.sh --token <new-token>"
    echo "AutoRestart: FAILED at $(date -Iseconds)" >> "$ALERT_FILE"
    exit 1
  fi
else
  alert "systemctl restart ${RUNNER_SERVICE} failed — service may not exist on this host."
  alert "Verify act_runner is installed via provision-runner.sh."
  echo "AutoRestart: systemctl failed at $(date -Iseconds)" >> "$ALERT_FILE"
  exit 1
fi

View File

@@ -0,0 +1,16 @@
# runner-health-probe.service — runs the runner health probe script once per
# activation; runner-health-probe.timer provides the schedule.
[Unit]
Description=Gitea Runner Health Probe (poka-yoke zero-runner detection)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
# Ordering only: run after networking and act_runner.service when they are
# started together; this line does not pull either unit in.
After=network.target act_runner.service
[Service]
# oneshot: the script runs to completion on each activation.
Type=oneshot
# Absolute path into the the-nexus checkout under /root/wizards.
ExecStart=/root/wizards/the-nexus/scripts/runner-health-probe.sh
# Both output streams go to the journal.
StandardOutput=journal
StandardError=journal
Environment=HOME=/root
# Token can be set here or via /etc/act_runner/gitea-probe-token file
# (the probe script reads GITEA_TOKEN, so an environment file would define that).
# EnvironmentFile=/etc/act_runner/probe.env
[Install]
# NOTE(review): with this section, enabling the service itself would also run
# the probe once at boot, independent of the timer — confirm that is intended.
WantedBy=multi-user.target

View File

@@ -0,0 +1,14 @@
# runner-health-probe.timer — schedules runner-health-probe.service (the unit
# with the matching name, since no Unit= is set).
[Unit]
Description=Gitea Runner Health Probe — fires every 5 minutes (poka-yoke #1097)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
[Timer]
# Start 2 minutes after boot (let network and act_runner settle)
OnBootSec=2min
# Then fire every 5 minutes, measured from the service's last activation
OnUnitActiveSec=5min
# NOTE(review): per systemd.timer(5), Persistent= only has an effect on timers
# configured with OnCalendar=. With the monotonic triggers above it does not
# re-fire missed runs; the first run after downtime comes from OnBootSec.
Persistent=true
[Install]
WantedBy=timers.target