diff --git a/bin/agent-loop.sh b/bin/agent-loop.sh
index ff892d34..79bab5ee 100755
--- a/bin/agent-loop.sh
+++ b/bin/agent-loop.sh
@@ -143,6 +143,14 @@ run_worker() {
     # Generate prompt
     prompt=$(bash "$(dirname "$0")/agent-dispatch.sh" "$AGENT" "$issue_num" "${repo_owner}/${repo_name}")
+    dispatch_rc=$?
+    if [ $dispatch_rc -ne 0 ]; then
+        log "WORKER-${worker_id}: DISPATCH FAILED #${issue_num} (rc=$dispatch_rc) — key validation or config error"
+        # ANDON Alert: CRITICAL — loop startup (key validation) failure
+        "$(dirname "$0")/andon-alert.sh" "CRITICAL" "agent-dispatch failed for ${AGENT} (issue #${issue_num}) — check API keys/tokens" "${AGENT}-loop" 2>/dev/null || true
+        # Exit worker — watchdog will handle restart/ESCALATION
+        exit 1
+    fi
 
     CYCLE_START=$(date +%s)
     set +e
 
@@ -236,13 +244,25 @@ print(json.dumps({
             log "WORKER-${worker_id}: UNVERIFIED #${issue_num} — $verify_details"
             mark_skip "$issue_num" "unverified" 1
             consecutive_failures=$((consecutive_failures + 1))
+            # ANDON Alert: CRITICAL on 3rd consecutive failure
+            if [ "$consecutive_failures" -eq 3 ]; then
+                "$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive failures on ${AGENT}-loop (issue #${issue_num}, unverified)" "${AGENT}-loop" 2>/dev/null || true
+            fi
         fi
     elif [ "$exit_code" -eq 124 ]; then
         log "WORKER-${worker_id}: TIMEOUT #${issue_num} (work saved in PR)"
         consecutive_failures=$((consecutive_failures + 1))
+        # ANDON Alert: CRITICAL on 3rd consecutive failure
+        if [ "$consecutive_failures" -eq 3 ]; then
+            "$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive TIMEOUT failures on ${AGENT}-loop (issue #${issue_num})" "${AGENT}-loop" 2>/dev/null || true
+        fi
     else
         log "WORKER-${worker_id}: FAILED #${issue_num} exit ${exit_code} (work saved in PR)"
         consecutive_failures=$((consecutive_failures + 1))
+        # ANDON Alert: CRITICAL on 3rd consecutive failure
+        if [ "$consecutive_failures" -eq 3 ]; then
+            "$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive FAILED exits on ${AGENT}-loop (issue #${issue_num}, exit=${exit_code})" "${AGENT}-loop" 2>/dev/null || true
+        fi
     fi
 
     # ── METRICS ──
diff --git a/bin/andon-alert.sh b/bin/andon-alert.sh
new file mode 100755
index 00000000..e7a2411b
--- /dev/null
+++ b/bin/andon-alert.sh
@@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+# andon-alert.sh — Real-time signal light for the factory floor
+# Part of Timmy_Foundation/timmy-config issue #347
+#
+# Usage: andon-alert.sh <severity> "<message>" "<lane>"
+#   andon-alert.sh "CRITICAL" "Groq API key expired" "agent-loop"
+#   andon-alert.sh "WARNING" "Gemini rate limited 10 times in a row" "gemini-loop"
+#   andon-alert.sh "HALT" "Jidoka quality gate triggered" "jidoka-gate"
+#
+# Severity levels:
+#   INFO     — logged only
+#   WARNING  — logged + Telegram
+#   CRITICAL — logged + Telegram + loop paused (flag file)
+#   HALT     — logged + Telegram + loop killed + flag file
+
+set -uo pipefail
+
+SEVERITY="${1:?Usage: andon-alert.sh <severity> <message> <lane>}"
+MESSAGE="${2:?Usage: andon-alert.sh <severity> <message> <lane>}"
+LANE="${3:?Usage: andon-alert.sh <severity> <message> <lane>}"
+
+# Normalize severity to uppercase for comparison
+SEV_UPPER="$(printf '%s' "$SEVERITY" | tr '[:lower:]' '[:upper:]')"
+
+# ── Config ──
+HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
+LOG_DIR="$HERMES_HOME/logs"
+ANDON_LOG="$LOG_DIR/andon-alert.log"
+ANDON_FLAG_DIR="$HERMES_HOME/andon-flags"
+ANDON_CRITICAL_DIR="$HERMES_HOME/andon-critical"
+ANDON_HALT_DIR="$HERMES_HOME/andon-halt"
+TELEGRAM_TOKEN_FILE="$HOME/.config/telegram/special_bot"
+TELEGRAM_CHAT="-1003664764329"
+
+mkdir -p "$LOG_DIR" "$ANDON_FLAG_DIR" "$ANDON_CRITICAL_DIR" "$ANDON_HALT_DIR"
+
+timestamp() { date '+%Y-%m-%d %H:%M:%S'; }
+
+log() {
+    echo "[$(timestamp)] [$SEVERITY] [$LANE] $MESSAGE" | tee -a "$ANDON_LOG"
+}
+
+send_telegram() {
+    if [ -f "$TELEGRAM_TOKEN_FILE" ]; then
+        TELEGRAM_TOKEN=$(tr -d '[:space:]' < "$TELEGRAM_TOKEN_FILE")
+        if [ -n "$TELEGRAM_TOKEN" ]; then
+            # Build message text
+            text="${SEVERITY} — ${LANE}: ${MESSAGE}"
+            # Send via Telegram API
+            curl -sf --max-time 10 -X POST \
+                "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
+                -d "chat_id=${TELEGRAM_CHAT}" \
+                -d "text=${text}" >/dev/null 2>&1 || true
+        fi
+    fi
+}
+
+# ── Actions by severity ──
+case "${SEV_UPPER}" in
+    INFO)
+        log "Info event — no further action"
+        ;;
+
+    WARNING)
+        log "Warning issued — sending Telegram alert"
+        send_telegram
+        ;;
+
+    CRITICAL)
+        log "Critical issue — sending Telegram + pausing lane"
+        send_telegram
+
+        # Create CRITICAL pause flag for this lane
+        # The lane file signals any loop to skip work until cleared
+        CRITICAL_FLAG="$ANDON_CRITICAL_DIR/${LANE}.flag"
+        echo "$(timestamp) — $MESSAGE" > "$CRITICAL_FLAG"
+
+        # Also touch general flag for Andon board visibility
+        touch "$ANDON_FLAG_DIR/${LANE}.critical"
+        ;;
+
+    HALT)
+        log "HALT condition — sending Telegram + killing loop processes"
+        send_telegram
+
+        # Create HALT flag with reason
+        HALT_FLAG="$ANDON_HALT_DIR/${LANE}.flag"
+        echo "$(timestamp) — $MESSAGE" > "$HALT_FLAG"
+
+        # Touch general flag for Andon board
+        touch "$ANDON_FLAG_DIR/${LANE}.halt"
+
+        # Kill processes associated with this lane
+        # Convention: process names contain the lane identifier
+        case "$LANE" in
+            agent-loop|claude-loop|gemini-loop|groq-loop|grok-loop|gemma4-loop)
+                pkill -f "${LANE}.sh" 2>/dev/null || true
+                ;;
+            jidoka-gate)
+                pkill -f "quality-gate" 2>/dev/null || true
+                ;;
+            watchdog)
+                # Do not kill watchdog itself; this alert is about a restart that already happened
+                ;;
+            *)
+                # Generic: try to kill by lane name
+                pkill -f "$LANE" 2>/dev/null || true
+                ;;
+        esac
+        ;;
+
+    *)
+        log "ERROR: Unknown severity '$SEVERITY'. Must be INFO, WARNING, CRITICAL, or HALT."
+        exit 1
+        ;;
+esac
+
+exit 0
diff --git a/bin/pane-watchdog.sh b/bin/pane-watchdog.sh
index 76d2eec7..1fe0d3cc 100755
--- a/bin/pane-watchdog.sh
+++ b/bin/pane-watchdog.sh
@@ -359,6 +359,8 @@ handle_stuck() {
     if [ "$attempts" -ge "$MAX_RESTART_ATTEMPTS" ]; then
         log "ESCALATION: $target stuck ${attempts}x — manual intervention needed"
         echo "ALERT: $target stuck after $attempts restart attempts" >&2
+        # ANDON Alert: CRITICAL — watchdog exhausted restart budget
+        "$(dirname "$0")/andon-alert.sh" "CRITICAL" "Watchdog failed to restart $target after ${attempts} attempts — manual intervention required" "watchdog" 2>/dev/null || true
         return 1
     fi
 
@@ -367,6 +369,8 @@
     if restart_pane "$target"; then
         log "OK: $target restarted successfully"
+        # ANDON Alert: WARNING — watchdog restart succeeded
+        "$(dirname "$0")/andon-alert.sh" "WARNING" "Watchdog restarted pane $target" "watchdog" 2>/dev/null || true
     else
         log "FAIL: $target restart failed (attempt $attempts)"
     fi
 
diff --git a/bin/start-loops.sh b/bin/start-loops.sh
index f9c0f47b..481c614d 100755
--- a/bin/start-loops.sh
+++ b/bin/start-loops.sh
@@ -23,6 +23,10 @@ log() {
 log "Running model health check..."
 if ! bash "$SCRIPT_DIR/model-health-check.sh"; then
     log "FATAL: Model health check failed. Aborting loop startup."
+    # ANDON Alert: CRITICAL — loop startup blocked
+    if [ -x "$SCRIPT_DIR/andon-alert.sh" ]; then
+        "$SCRIPT_DIR/andon-alert.sh" "CRITICAL" "Model health check failed — loops not started" "startup" 2>/dev/null || true
+    fi
     exit 1
 fi
 log "Model health check passed."
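
Note (outside the diff): on CRITICAL, andon-alert.sh writes a per-lane pause flag, and its header comment says the flag "signals any loop to skip work until cleared", but no consumer of that flag is shown in this change. Below is a minimal sketch of how a lane loop might check the flag at the top of each cycle. The helper name andon_paused is hypothetical and not part of this diff; the flag path reuses the ANDON_CRITICAL_DIR default defined in andon-alert.sh above.

# Sketch only: andon_paused is a hypothetical helper, not part of this diff.
# It reuses the ANDON_CRITICAL_DIR default ($HERMES_HOME/andon-critical).
andon_paused() {
    local lane="$1"
    local flag="${HERMES_HOME:-$HOME/.hermes}/andon-critical/${lane}.flag"
    if [ -f "$flag" ]; then
        # Flag present: report why the lane is paused and tell the caller to skip.
        echo "[andon] ${lane} paused: $(cat "$flag")" >&2
        return 0
    fi
    return 1    # no flag; clear to pick up work
}

# Possible call site at the top of a worker cycle:
#   if andon_paused "${AGENT}-loop"; then sleep 60; continue; fi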