[claude] Poka-yoke runner health: provision + health probe scripts (#1097) (#1101)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
This commit was merged in pull request #1101.
This commit is contained in:
229
scripts/provision-runner.sh
Normal file
229
scripts/provision-runner.sh
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# provision-runner.sh — VPS provisioning script for Gitea act_runner
|
||||||
|
# Refs: #1097 (POKA-YOKE: Make unregistered runners impossible to miss)
|
||||||
|
#
|
||||||
|
# Usage (on Bezalel VPS as root):
|
||||||
|
# bash provision-runner.sh --gitea-url <url> --token <runner-registration-token>
|
||||||
|
#
|
||||||
|
# This script:
|
||||||
|
# 1. Downloads and installs act_runner binary
|
||||||
|
# 2. Registers the runner with the Gitea instance
|
||||||
|
# 3. Creates and enables systemd service for act_runner
|
||||||
|
# 4. Installs the runner-health-probe timer (poka-yoke detection layer)
|
||||||
|
#
|
||||||
|
# POKA-YOKE principles applied:
|
||||||
|
# Prevention: runner registration is mandatory — script exits non-zero if registration fails
|
||||||
|
# Detection: runner-health-probe.sh installed and enabled as part of this script
|
||||||
|
# Correction: health probe auto-restarts act_runner on zero-runner detection
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# ── Configuration defaults (override via env or flags) ───────────────────────
# A non-empty value already present in the environment wins; otherwise the
# default written here is assigned.
: "${GITEA_URL:=https://forge.alexanderwhitestone.com}"
: "${RUNNER_TOKEN:=}"
: "${RUNNER_NAME:=$(hostname)-runner}"
: "${RUNNER_LABELS:=ubuntu-latest,linux,x86_64}"
: "${ACT_RUNNER_VERSION:=0.2.10}"
: "${INSTALL_DIR:=/usr/local/bin}"
: "${CONFIG_DIR:=/etc/act_runner}"
: "${DATA_DIR:=/var/lib/act_runner}"
: "${NEXUS_DIR:=/root/wizards/the-nexus}"
# Location of the health probe inside the the-nexus checkout.
PROBE_SCRIPT="${NEXUS_DIR}/scripts/runner-health-probe.sh"
|
||||||
|
|
||||||
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
# Write a timestamped progress line to stdout.
log() {
  printf '[%s] PROVISION: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Write a timestamped error line to stderr and terminate with status 1.
fail() {
  printf '[%s] PROVISION ERROR: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2
  exit 1
}
|
||||||
|
|
||||||
|
# Print the command-line help text to stdout.
# Reads GITEA_URL, RUNNER_LABELS, ACT_RUNNER_VERSION and NEXUS_DIR so the
# defaults shown are the values currently in effect.
usage() {
  cat <<EOF
Usage: provision-runner.sh [options]

Options:
  --gitea-url <url>    Gitea base URL (default: $GITEA_URL)
  --token <token>      Runner registration token (required)
  --name <name>        Runner name (default: hostname-runner)
  --labels <labels>    Comma-separated labels (default: $RUNNER_LABELS)
  --version <ver>      act_runner version to install (default: $ACT_RUNNER_VERSION)
  --nexus-dir <path>   Path to the-nexus checkout (default: $NEXUS_DIR)
  --help               Show this help

Environment variables: GITEA_URL, RUNNER_TOKEN, RUNNER_NAME, RUNNER_LABELS,
  ACT_RUNNER_VERSION, NEXUS_DIR, INSTALL_DIR, CONFIG_DIR, DATA_DIR

POKA-YOKE CHECKLIST (enforced by this script):
  [1] act_runner binary installed and executable
  [2] Runner registered with Gitea (non-zero runner count verified)
  [3] act_runner systemd service enabled and running
  [4] runner-health-probe timer installed and enabled
EOF
}
|
||||||
|
|
||||||
|
# ── Argument parsing ──────────────────────────────────────────────────────────
# Parse command-line flags into the global configuration variables.
# Every value-taking flag is checked for a following argument, so a trailing
# "--token" produces a readable error instead of an unbound-variable abort
# on "$2" when the script runs under `set -u`.
# Arguments: the script's positional parameters ("$@").
# Exits:     0 after printing help for --help; 1 (through fail) for an
#            unknown flag or a flag that is missing its value.
parse_args() {
  local flag
  while (( $# > 0 )); do
    flag=$1
    case "$flag" in
      --help)
        usage
        exit 0
        ;;
      --gitea-url | --token | --name | --labels | --version | --nexus-dir)
        (( $# >= 2 )) || fail "Option ${flag} requires a value. Use --help for usage."
        ;;
      *)
        fail "Unknown argument: ${flag}. Use --help for usage."
        ;;
    esac

    case "$flag" in
      --gitea-url) GITEA_URL=$2 ;;
      --token)     RUNNER_TOKEN=$2 ;;
      --name)      RUNNER_NAME=$2 ;;
      --labels)    RUNNER_LABELS=$2 ;;
      --version)   ACT_RUNNER_VERSION=$2 ;;
      --nexus-dir)
        NEXUS_DIR=$2
        # The probe path is derived from NEXUS_DIR, so keep it in sync.
        PROBE_SCRIPT="${NEXUS_DIR}/scripts/runner-health-probe.sh"
        ;;
    esac
    shift 2
  done
}

parse_args "$@"

# Registration cannot proceed without a token from either the flag or the env.
if [[ -z "$RUNNER_TOKEN" ]]; then
  fail "Runner registration token required. Pass --token or set RUNNER_TOKEN env var."
fi
|
||||||
|
|
||||||
|
# ── Step 1: Install act_runner binary ─────────────────────────────────────────
log "Step 1/4: Installing act_runner v${ACT_RUNNER_VERSION}..."

# Map the kernel architecture name to the suffix used in the release file name.
ARCH=$(uname -m)
case "$ARCH" in
  x86_64)  ARCH_SUFFIX="amd64" ;;
  aarch64) ARCH_SUFFIX="arm64" ;;
  *)       fail "Unsupported architecture: $ARCH" ;;
esac

BINARY_URL="https://gitea.com/gitea/act_runner/releases/download/v${ACT_RUNNER_VERSION}/act_runner-${ACT_RUNNER_VERSION}-linux-${ARCH_SUFFIX}"
BINARY_PATH="${INSTALL_DIR}/act_runner"

# Download the release binary to a temporary file next to BINARY_PATH and
# rename it into place only after the transfer succeeded. A failed or
# interrupted download therefore never leaves a truncated binary behind, and
# the upgrade does not write into an executable that may currently be running.
download_act_runner() {
  local tmp_file
  mkdir -p "$INSTALL_DIR"
  tmp_file=$(mktemp "${BINARY_PATH}.XXXXXX") \
    || fail "Cannot create temporary file in ${INSTALL_DIR}"
  if ! curl -fsSL "$BINARY_URL" -o "$tmp_file"; then
    rm -f -- "$tmp_file"
    fail "Download failed: ${BINARY_URL}"
  fi
  # mktemp creates the file private to the owner; set the final mode explicitly.
  chmod 0755 "$tmp_file"
  mv -f -- "$tmp_file" "$BINARY_PATH"
}

if [[ -f "$BINARY_PATH" ]]; then
  # First x.y.z-looking token printed by --version; "unknown" if there is none.
  CURRENT_VER=$("$BINARY_PATH" --version 2>/dev/null \
    | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -n 1) || true
  CURRENT_VER=${CURRENT_VER:-unknown}
  if [[ "$CURRENT_VER" == "$ACT_RUNNER_VERSION" ]]; then
    log "act_runner v${ACT_RUNNER_VERSION} already installed — skipping download."
  else
    log "Upgrading act_runner from v${CURRENT_VER} to v${ACT_RUNNER_VERSION}..."
    download_act_runner
  fi
else
  download_act_runner
fi

# Checklist item [1]: the installed binary must actually execute.
"$BINARY_PATH" --version >/dev/null 2>&1 || fail "act_runner binary not functional after install."
log "act_runner binary OK: $("$BINARY_PATH" --version 2>/dev/null || echo 'installed')"
|
||||||
|
|
||||||
|
# ── Step 2: Register runner with Gitea ────────────────────────────────────────
log "Step 2/4: Registering runner with Gitea at ${GITEA_URL}..."

mkdir -p "$CONFIG_DIR" "$DATA_DIR"

CONFIG_FILE="${CONFIG_DIR}/config.yaml"
# Kept in DATA_DIR rather than a fixed, predictable name under /tmp.
REGISTER_LOG="${DATA_DIR}/register.log"

# --config names a file act_runner reads, so make sure one exists before
# registering. An existing (possibly customised) config is left untouched.
if [[ ! -s "$CONFIG_FILE" ]]; then
  "$BINARY_PATH" generate-config > "$CONFIG_FILE" \
    || fail "Could not write default config to ${CONFIG_FILE}"
fi

# Register from DATA_DIR: it is the WorkingDirectory of the systemd service,
# so a registration file stored relative to the current directory ends up
# where the daemon runs.
# NOTE(review): assumes the registration file path in the config is the
# relative default (.runner) — confirm when a custom config is supplied.
REGISTER_RC=0
(
  cd "$DATA_DIR" \
    && "$BINARY_PATH" register \
      --no-interactive \
      --instance "$GITEA_URL" \
      --token "$RUNNER_TOKEN" \
      --name "$RUNNER_NAME" \
      --labels "$RUNNER_LABELS" \
      --config "$CONFIG_FILE"
) 2>&1 | tee "$REGISTER_LOG" || REGISTER_RC=$?

# Capturing the status (instead of letting `set -e` abort on the pipeline)
# guarantees the explanatory message below is printed on failure.
if (( REGISTER_RC != 0 )); then
  fail "Runner registration failed. Check token and Gitea URL. Log: ${REGISTER_LOG}"
fi

# A zero exit status alone is not treated as proof: require positive evidence,
# either the success message or a registration file. The mere existence of the
# config file says nothing about registration.
if ! grep -q "Runner registered" "$REGISTER_LOG" 2>/dev/null \
  && [[ ! -s "${DATA_DIR}/.runner" ]]; then
  fail "Runner registration failed. Check token and Gitea URL. Log: ${REGISTER_LOG}"
fi

log "Runner registered. Config written to ${CONFIG_FILE}"
|
||||||
|
|
||||||
|
# ── Step 3: Create and enable systemd service ─────────────────────────────────
log "Step 3/4: Installing act_runner systemd service..."

# Wants= only pulls network-online.target in; the matching After= is what
# delays the runner until the network is reported up.
cat > /etc/systemd/system/act_runner.service <<EOF
[Unit]
Description=Gitea Actions Runner (act_runner)
Documentation=https://gitea.com/gitea/act_runner
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=root
WorkingDirectory=${DATA_DIR}
ExecStart=${INSTALL_DIR}/act_runner daemon --config ${CONFIG_FILE}
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
Environment=HOME=/root

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl enable act_runner
systemctl restart act_runner
# Give the daemon a moment to either settle or crash before checking it.
sleep 3

# Checklist item [3]: the service must be active, otherwise provisioning fails.
if ! systemctl is-active --quiet act_runner; then
  fail "act_runner service failed to start. Check: journalctl -u act_runner -n 50"
fi
log "act_runner service running."
|
||||||
|
|
||||||
|
# ── Step 4: Install runner health probe ───────────────────────────────────────
log "Step 4/4: Installing runner-health-probe systemd timer..."

if [[ ! -f "$PROBE_SCRIPT" ]]; then
  # Without the checkout there is nothing to install; tell the operator how
  # to finish the job later.
  log "WARNING: probe script not found at ${PROBE_SCRIPT}. Skipping timer install."
  log " Re-run after the-nexus is checked out to: ${NEXUS_DIR}"
  log " Then manually: systemctl enable --now runner-health-probe.timer"
else
  chmod +x "$PROBE_SCRIPT"

  # Install service unit (oneshot; started by the timer below)
  cat > /etc/systemd/system/runner-health-probe.service <<EOF
[Unit]
Description=Gitea Runner Health Probe (poka-yoke zero-runner detection)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
After=network.target act_runner.service

[Service]
Type=oneshot
ExecStart=${PROBE_SCRIPT}
StandardOutput=journal
StandardError=journal
Environment=HOME=/root
EOF

  # Install timer unit (every 5 minutes).
  # Persistent= is intentionally absent: systemd documents it as only having
  # an effect on OnCalendar= timers, and this timer is purely monotonic.
  cat > /etc/systemd/system/runner-health-probe.timer <<EOF
[Unit]
Description=Gitea Runner Health Probe — every 5 minutes (poka-yoke #1097)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097

[Timer]
OnBootSec=2min
OnUnitActiveSec=5min

[Install]
WantedBy=timers.target
EOF

  systemctl daemon-reload
  systemctl enable --now runner-health-probe.timer
  log "runner-health-probe.timer enabled (fires every 5 minutes)."

  # The probe exits with status 2 on every run while it has no API token, and
  # the unit above passes none through the environment, so its default token
  # file is the only source. Make a missing token visible at provisioning time.
  PROBE_TOKEN_FILE="/etc/act_runner/gitea-probe-token"
  if [[ ! -s "$PROBE_TOKEN_FILE" ]]; then
    log "WARNING: no probe API token at ${PROBE_TOKEN_FILE}. The health probe cannot query Gitea until one is written there."
  fi
fi
|
||||||
|
|
||||||
|
# ── Poka-yoke checklist summary ───────────────────────────────────────────────
# Re-check each of the four checklist items and print one status row per item.
echo ""
echo "══════════════════════════════════════════════════════════"
echo " POKA-YOKE PROVISIONING CHECKLIST — $(hostname)"
echo "══════════════════════════════════════════════════════════"

printf " [1] act_runner binary : "
if "$BINARY_PATH" --version >/dev/null 2>&1; then
  echo "OK"
else
  echo "FAIL"
fi

printf " [2] act_runner registered : "
if [[ -f "$CONFIG_FILE" ]]; then
  echo "OK (config exists)"
else
  echo "FAIL (no config)"
fi

printf " [3] act_runner service : "
if systemctl is-active --quiet act_runner; then
  echo "RUNNING"
else
  echo "FAIL"
fi

printf " [4] health-probe timer : "
if systemctl is-active --quiet runner-health-probe.timer 2>/dev/null; then
  echo "ACTIVE"
else
  echo "NOT INSTALLED (re-run after nexus checkout)"
fi

echo "══════════════════════════════════════════════════════════"
echo ""
log "Provisioning complete. Runner '${RUNNER_NAME}' registered at ${GITEA_URL}"
|
||||||
190
scripts/runner-health-probe.sh
Normal file
190
scripts/runner-health-probe.sh
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# runner-health-probe.sh — Gitea Runner Health Probe (poka-yoke detection layer)
|
||||||
|
# Refs: #1097 (POKA-YOKE: Make unregistered runners impossible to miss)
|
||||||
|
#
|
||||||
|
# Called every 5 minutes by runner-health-probe.timer (systemd).
|
||||||
|
# Can also be run manually for immediate status.
|
||||||
|
#
|
||||||
|
# POKA-YOKE detection + correction:
|
||||||
|
# 1. Queries Gitea API for active runner count
|
||||||
|
# 2. Reports count to Timmy Time via journal/log every run
|
||||||
|
# 3. On ZERO active runners:
|
||||||
|
# a. Logs P1 alert to journal
|
||||||
|
# b. Creates alert marker file for external watchers
|
||||||
|
# c. Attempts to restart act_runner service (auto-correction)
|
||||||
|
# d. Re-queries after restart to verify recovery
|
||||||
|
#
|
||||||
|
# Exit codes:
|
||||||
|
# 0 — runners healthy (≥1 online runner)
|
||||||
|
# 1 — zero runners detected (P1 alert fired)
|
||||||
|
# 2 — Gitea API unreachable (network/config error)
|
||||||
|
|
||||||
|
set -uo pipefail
|
||||||
|
|
||||||
|
# ── Configuration ─────────────────────────────────────────────────────────────
# Every setting can be overridden through the environment.
GITEA_URL="${GITEA_URL:-https://forge.alexanderwhitestone.com}"
GITEA_TOKEN="${GITEA_TOKEN:-}"
GITEA_TOKEN_FILE="${GITEA_TOKEN_FILE:-/etc/act_runner/gitea-probe-token}"
ALERT_DIR="${ALERT_DIR:-/var/lib/act_runner/alerts}"
RUNNER_SERVICE="${RUNNER_SERVICE:-act_runner}"
# Restart cooldown: don't restart more than once per COOLDOWN_SECS seconds
# (default 600 = 10 minutes). COOLDOWN_FILE stores the epoch time of the last
# restart.
COOLDOWN_FILE="${ALERT_DIR}/.last_restart"
COOLDOWN_SECS="${COOLDOWN_SECS:-600}"
|
||||||
|
|
||||||
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
# Timestamped informational line on stdout.
log() {
  printf '[%s] RUNNER-PROBE: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Timestamped warning line on stderr.
warn() {
  printf '[%s] RUNNER-PROBE WARNING: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2
}

# Timestamped P1 alert line on stderr.
alert() {
  printf '[%s] RUNNER-PROBE P1-ALERT: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2
}
|
||||||
|
|
||||||
|
# Load token from file if not set via env.
# All whitespace is stripped so a file saved with a trailing newline or CRLF
# line ending does not end up inside the Authorization header.
if [[ -z "$GITEA_TOKEN" && -f "$GITEA_TOKEN_FILE" ]]; then
  GITEA_TOKEN=$(tr -d '[:space:]' < "$GITEA_TOKEN_FILE")
fi

# Create the alert directory first so that a configuration error below can be
# recorded where external watchers look.
if ! mkdir -p "$ALERT_DIR"; then
  warn "Cannot create alert directory ${ALERT_DIR}."
  exit 2
fi

if [[ -z "$GITEA_TOKEN" ]]; then
  warn "No Gitea API token configured. Set GITEA_TOKEN env var or write to ${GITEA_TOKEN_FILE}"
  warn "Cannot query runner health without API token. Exiting."
  # A probe without a token is as blind as one that cannot reach the API, so
  # record it in the same error log.
  echo "$(date -Iseconds) PROBE_ERROR: no API token configured" >> "${ALERT_DIR}/probe-errors.log"
  exit 2
fi
|
||||||
|
|
||||||
|
# ── Query Gitea runner count ───────────────────────────────────────────────────
|
||||||
|
#######################################
# Ask the Gitea admin API for the runner list and count the runners whose
# status is "online", "idle" or "active".
# Globals:   GITEA_URL, GITEA_TOKEN (read)
# Outputs:   the count on stdout (only on success); diagnostics via warn
# Returns:   0 on success;
#            2 when the request fails, the HTTP status is not 200, or the
#              response cannot be parsed. An unparseable response is reported
#              as an error rather than as "0 runners", so it cannot trigger a
#              false zero-runner alert and service restart.
#######################################
query_active_runners() {
  local raw http_code body runner_count

  # No --fail: HTTP error statuses are reported through the status marker
  # appended by -w, which allows the more specific message below.
  raw=$(curl -s \
    --max-time 15 \
    -H "Authorization: token ${GITEA_TOKEN}" \
    -H "Content-Type: application/json" \
    -w '\n__HTTP_STATUS__%{http_code}' \
    "${GITEA_URL}/api/v1/admin/runners?limit=50" 2>/dev/null) || {
    warn "Gitea API request failed (curl error). URL: ${GITEA_URL}/api/v1/admin/runners"
    return 2
  }

  # Split "<body>\n__HTTP_STATUS__<code>" into its two parts.
  http_code=${raw##*__HTTP_STATUS__}
  body=${raw%$'\n'__HTTP_STATUS__*}

  if [[ "$http_code" != "200" ]]; then
    warn "Gitea API returned HTTP ${http_code}. Check token permissions (requires admin)."
    return 2
  fi

  # Accept either a bare JSON list or an object wrapping the list under
  # "runners" or "data".
  runner_count=$(printf '%s' "$body" | python3 -c '
import sys, json
data = json.load(sys.stdin)
if isinstance(data, list):
    runners = data
else:
    runners = data.get("runners") or data.get("data") or []
print(sum(1 for r in runners if r.get("status") in ("online", "idle", "active")))
' 2>/dev/null) || {
    warn "Could not parse runner list from Gitea API response."
    return 2
  }

  if [[ ! "$runner_count" =~ ^[0-9]+$ ]]; then
    warn "Could not parse runner list from Gitea API response."
    return 2
  fi

  printf '%s\n' "$runner_count"
  return 0
}
|
||||||
|
|
||||||
|
# ── Cooldown check ────────────────────────────────────────────────────────────
|
||||||
|
#######################################
# Tell whether the last recorded restart is recent enough to suppress another.
# Globals:   COOLDOWN_FILE, COOLDOWN_SECS (read)
# Outputs:   a log line when the cooldown is active; a warning when the
#            cooldown file does not contain a timestamp
# Returns:   0 if a restart happened less than COOLDOWN_SECS seconds ago,
#            1 otherwise. A missing, empty, corrupt or future-dated cooldown
#            file counts as "no cooldown", so a damaged file can never block
#            restarts permanently.
#######################################
in_cooldown() {
  local last_restart now age

  [[ -f "$COOLDOWN_FILE" ]] || return 1

  last_restart=$(cat "$COOLDOWN_FILE" 2>/dev/null) || last_restart=""
  if [[ ! "$last_restart" =~ ^[0-9]+$ ]]; then
    warn "Ignoring unreadable cooldown timestamp in ${COOLDOWN_FILE}."
    return 1
  fi

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  age=$(( now - 10#$last_restart ))
  if (( age >= 0 && age < COOLDOWN_SECS )); then
    log "Restart cooldown active (${age}s < ${COOLDOWN_SECS}s). Skipping restart attempt."
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
|
# Store the current epoch time in COOLDOWN_FILE as the start of the cooldown.
record_restart() {
  local stamp
  stamp=$(date +%s)
  printf '%s\n' "$stamp" > "$COOLDOWN_FILE"
}
|
||||||
|
|
||||||
|
# ── Main probe logic ───────────────────────────────────────────────────────────
log "Querying Gitea runner health at ${GITEA_URL}..."

RUNNER_COUNT=$(query_active_runners)
QUERY_EXIT=$?

# Any failure status, or an answer that is not a plain number, means runner
# health could not be assessed. It must not fall through to the zero-runner
# path, which would restart the service on a probe error.
if (( QUERY_EXIT != 0 )) || [[ ! "$RUNNER_COUNT" =~ ^[0-9]+$ ]]; then
  warn "API unreachable — cannot assess runner health. Check network and token."
  # Write an "unknown" alert marker so monitoring can see the probe itself is broken
  echo "$(date -Iseconds) PROBE_ERROR: API unreachable" >> "${ALERT_DIR}/probe-errors.log"
  exit 2
fi

log "Active runner count: ${RUNNER_COUNT}"

# ── Healthy path ──────────────────────────────────────────────────────────────
if (( RUNNER_COUNT > 0 )); then
  log "Runners OK. ${RUNNER_COUNT} active runner(s) online."
  # Clear any stale P1 alert marker
  rm -f "${ALERT_DIR}/p1-zero-runners.alert"
  exit 0
fi
|
||||||
|
|
||||||
|
# ── Zero-runner P1 alert path ─────────────────────────────────────────────────
alert "ZERO active runners detected on ${GITEA_URL}!"
alert "All CI jobs will queue silently. Attempting auto-correction."

# (Re)create the P1 marker file that external monitoring watches.
ALERT_FILE="${ALERT_DIR}/p1-zero-runners.alert"
{
  printf '%s\n' "P1 ALERT — ZERO GITEA RUNNERS"
  printf '%s\n' "Detected : $(date -Iseconds)"
  printf '%s\n' "Host : $(hostname)"
  printf '%s\n' "Gitea : ${GITEA_URL}"
  printf '%s\n' "Impact : ALL CI jobs queuing silently — no runners available"
  printf '%s\n' "Action : Auto-restart of ${RUNNER_SERVICE} attempted (see below)"
} > "$ALERT_FILE"

log "P1 alert written to ${ALERT_FILE}"
|
||||||
|
|
||||||
|
# ── Auto-correction: restart act_runner ───────────────────────────────────────
# Every path through this section exits: 0 when runners are back after the
# restart, 1 in all other cases (the P1 alert stays in place).
if in_cooldown; then
  alert "Cannot attempt restart — cooldown active. Manual intervention may be required."
  alert "Check: systemctl status ${RUNNER_SERVICE}"
  alert "See alert file: ${ALERT_FILE}"
  exit 1
fi

log "Attempting to restart ${RUNNER_SERVICE} service..."
if ! systemctl restart "$RUNNER_SERVICE" 2>&1; then
  alert "systemctl restart ${RUNNER_SERVICE} failed — service may not exist on this host."
  alert "Verify act_runner is installed via provision-runner.sh."
  echo "AutoRestart: systemctl failed at $(date -Iseconds)" >> "$ALERT_FILE"
  exit 1
fi

# The cooldown starts only once a restart was actually issued.
record_restart
log "Service restart issued. Waiting 15s for runner to register..."
sleep 15

# Re-query to verify recovery
RUNNER_COUNT_AFTER=$(query_active_runners 2>/dev/null)
REQUERY_EXIT=$?

# A failed re-query says nothing about the runners, so report it as
# "unverified" instead of claiming that zero runners are online.
if (( REQUERY_EXIT != 0 )) || [[ ! "$RUNNER_COUNT_AFTER" =~ ^[0-9]+$ ]]; then
  alert "Recovery UNVERIFIED: Gitea API query failed after restart."
  alert "Check: systemctl status ${RUNNER_SERVICE}"
  echo "AutoRestart: UNVERIFIED (API query failed) at $(date -Iseconds)" >> "$ALERT_FILE"
  exit 1
fi

if (( RUNNER_COUNT_AFTER > 0 )); then
  log "Recovery SUCCESS: ${RUNNER_COUNT_AFTER} runner(s) online after restart."
  # Append recovery note to alert file (leave file as audit trail)
  echo "Recovered : $(date -Iseconds) — ${RUNNER_COUNT_AFTER} runner(s) online after restart" >> "$ALERT_FILE"
  exit 0
fi

# provision-runner.sh sits next to this script in the repository, so point the
# operator at the copy belonging to this checkout.
PROVISION_SCRIPT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/provision-runner.sh"

alert "Recovery FAILED: still zero runners after restart."
alert "Manual intervention required."
alert "Next steps:"
alert " 1. ssh root@$(hostname) 'journalctl -u ${RUNNER_SERVICE} -n 100'"
alert " 2. Verify registration token: ${GITEA_URL}/user/settings/applications"
alert " 3. Re-run: ${PROVISION_SCRIPT} --token <new-token>"
echo "AutoRestart: FAILED at $(date -Iseconds)" >> "$ALERT_FILE"
exit 1
|
||||||
16
scripts/systemd/runner-health-probe.service
Normal file
16
scripts/systemd/runner-health-probe.service
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
[Unit]
Description=Gitea Runner Health Probe (poka-yoke zero-runner detection)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097
After=network.target act_runner.service

[Service]
Type=oneshot
ExecStart=/root/wizards/the-nexus/scripts/runner-health-probe.sh
StandardOutput=journal
StandardError=journal
Environment=HOME=/root
# Token can be set here or via /etc/act_runner/gitea-probe-token file
# EnvironmentFile=/etc/act_runner/probe.env

# No [Install] section on purpose: this oneshot is started by
# runner-health-probe.timer. Enabling it under multi-user.target would run the
# probe at boot, before the timer's 2-minute settling delay has passed.
|
||||||
14
scripts/systemd/runner-health-probe.timer
Normal file
14
scripts/systemd/runner-health-probe.timer
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
[Unit]
Description=Gitea Runner Health Probe — fires every 5 minutes (poka-yoke #1097)
Documentation=https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/issues/1097

[Timer]
# Start 2 minutes after boot (let network and act_runner settle)
OnBootSec=2min
# Then fire every 5 minutes, measured from the last activation of the service
OnUnitActiveSec=5min
# Persistent= is not set: it only has an effect on OnCalendar= timers, and
# this timer is purely monotonic.

[Install]
WantedBy=timers.target
|
||||||
Reference in New Issue
Block a user