feat(andon): implement real-time signal light system (#347)
Some checks failed
Smoke Test / smoke (pull_request) Failing after 20s
Architecture Lint / Linter Tests (pull_request) Successful in 24s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 17s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 57s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Deploy Script Dry Run (pull_request) Successful in 11s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Config / Shell Script Lint (pull_request) Failing after 1m3s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Architecture Lint / Lint Repository (pull_request) Failing after 21s
PR Checklist / pr-checklist (pull_request) Successful in 5m4s
Some checks failed
Smoke Test / smoke (pull_request) Failing after 20s
Architecture Lint / Linter Tests (pull_request) Successful in 24s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 17s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 57s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Deploy Script Dry Run (pull_request) Successful in 11s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Config / Shell Script Lint (pull_request) Failing after 1m3s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Architecture Lint / Lint Repository (pull_request) Failing after 21s
PR Checklist / pr-checklist (pull_request) Successful in 5m4s
- Add bin/andon-alert.sh with severity levels: INFO, WARNING, CRITICAL, HALT
- Wire into agent-loop: CRITICAL on 3 consecutive failures and dispatch key validation errors
- Wire into start-loops: CRITICAL on model health check failure
- Wire into pane-watchdog: WARNING on successful restart, CRITICAL on max restart exhaustion
- Creates ~/.hermes/andon-{flags,critical,halt} directories for state
The Andon system makes failures visible immediately via Telegram alerts
and local flag files, turning the factory floor into a transparent
operations center.
Closes #347
This commit is contained in:
@@ -143,6 +143,14 @@ run_worker() {
|
||||
|
||||
# Generate prompt
|
||||
prompt=$(bash "$(dirname "$0")/agent-dispatch.sh" "$AGENT" "$issue_num" "${repo_owner}/${repo_name}")
|
||||
dispatch_rc=$?
|
||||
if [ $dispatch_rc -ne 0 ]; then
|
||||
log "WORKER-${worker_id}: DISPATCH FAILED #${issue_num} (rc=$dispatch_rc) — key validation or config error"
|
||||
# ANDON Alert: CRITICAL — loop startup (key validation) failure
|
||||
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "agent-dispatch failed for ${AGENT} (issue #${issue_num}) — check API keys/tokens" "${AGENT}-loop" 2>/dev/null || true
|
||||
# Exit worker — watchdog will handle restart/ESCALATION
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CYCLE_START=$(date +%s)
|
||||
set +e
|
||||
@@ -236,13 +244,25 @@ print(json.dumps({
|
||||
log "WORKER-${worker_id}: UNVERIFIED #${issue_num} — $verify_details"
|
||||
mark_skip "$issue_num" "unverified" 1
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
# ANDON Alert: CRITICAL on 3rd consecutive failure
|
||||
if [ "$consecutive_failures" -eq 3 ]; then
|
||||
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive failures on ${AGENT}-loop (issue #${issue_num}, unverified)" "${AGENT}-loop" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
log "WORKER-${worker_id}: TIMEOUT #${issue_num} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
# ANDON Alert: CRITICAL on 3rd consecutive failure
|
||||
if [ "$consecutive_failures" -eq 3 ]; then
|
||||
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive TIMEOUT failures on ${AGENT}-loop (issue #${issue_num})" "${AGENT}-loop" 2>/dev/null || true
|
||||
fi
|
||||
else
|
||||
log "WORKER-${worker_id}: FAILED #${issue_num} exit ${exit_code} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
# ANDON Alert: CRITICAL on 3rd consecutive failure
|
||||
if [ "$consecutive_failures" -eq 3 ]; then
|
||||
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive FAILED exits on ${AGENT}-loop (issue #${issue_num}, exit=${exit_code})" "${AGENT}-loop" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── METRICS ──
|
||||
|
||||
118
bin/andon-alert.sh
Executable file
118
bin/andon-alert.sh
Executable file
@@ -0,0 +1,118 @@
|
||||
#!/usr/bin/env bash
# andon-alert.sh — Real-time signal light for the factory floor
# Part of Timmy_Foundation/timmy-config issue #347
#
# Usage: andon-alert.sh <SEVERITY> "<MESSAGE>" "<LANE>"
#        andon-alert.sh "CRITICAL" "Groq API key expired" "agent-loop"
#        andon-alert.sh "WARNING" "Gemini rate limited 10 times in a row" "gemini-loop"
#        andon-alert.sh "HALT" "Jidoka quality gate triggered" "jidoka-gate"
#
# Severity levels:
#   INFO     — logged only
#   WARNING  — logged + Telegram
#   CRITICAL — logged + Telegram + loop paused (flag file)
#   HALT     — logged + Telegram + loop killed + flag file
#
# Environment overrides:
#   HERMES_HOME   — state root (default: ~/.hermes)
#   TELEGRAM_CHAT — destination chat id (default: production ops channel)

# Deliberately no `set -e`: an alerting tool must not die half-way through
# its actions; each fallible step below handles its own failure instead.
set -uo pipefail

SEVERITY="${1:?Usage: andon-alert.sh <SEVERITY> <MESSAGE> <LANE>}"
MESSAGE="${2:?Usage: andon-alert.sh <SEVERITY> <MESSAGE> <LANE>}"
LANE="${3:?Usage: andon-alert.sh <SEVERITY> <MESSAGE> <LANE>}"

# Normalize severity to uppercase so callers may pass "critical", "Halt", etc.
SEV_UPPER="$(printf '%s' "$SEVERITY" | tr '[:lower:]' '[:upper:]')"

# ── Config ──
HERMES_HOME="${HERMES_HOME:-$HOME/.hermes}"
LOG_DIR="$HERMES_HOME/logs"
ANDON_LOG="$LOG_DIR/andon-alert.log"
ANDON_FLAG_DIR="$HERMES_HOME/andon-flags"        # Andon-board visibility flags
ANDON_CRITICAL_DIR="$HERMES_HOME/andon-critical" # per-lane pause flags
ANDON_HALT_DIR="$HERMES_HOME/andon-halt"         # per-lane halt flags
TELEGRAM_TOKEN_FILE="$HOME/.config/telegram/special_bot"
# Destination chat id; env-overridable so staging/tests can redirect alerts
# instead of spamming the production channel. Default is unchanged.
TELEGRAM_CHAT="${TELEGRAM_CHAT:--1003664764329}"

mkdir -p "$LOG_DIR" "$ANDON_FLAG_DIR" "$ANDON_CRITICAL_DIR" "$ANDON_HALT_DIR"
|
||||
|
||||
# timestamp: current local time as "YYYY-MM-DD HH:MM:SS".
timestamp() { date '+%Y-%m-%d %H:%M:%S'; }

# log: emit one alert line to stdout and append it to the andon log.
# Reads globals: SEVERITY, LANE, MESSAGE, ANDON_LOG.
# $1 (optional): extra action detail. The original implementation silently
# discarded this argument even though every call site passes one; it is now
# appended to the standard alert line.
log() {
  local detail="${1:-}"
  local line="[$(timestamp)] [$SEVERITY] [$LANE] $MESSAGE"
  if [ -n "$detail" ]; then
    line="$line — $detail"
  fi
  printf '%s\n' "$line" | tee -a "$ANDON_LOG"
}
|
||||
|
||||
# send_telegram: best-effort push of the current alert to the Telegram chat.
# Silently does nothing when the bot token file is missing or empty; never
# fails the caller (alert delivery is advisory, not critical-path).
# Reads globals: TELEGRAM_TOKEN_FILE, TELEGRAM_CHAT, SEVERITY, LANE, MESSAGE.
send_telegram() {
  local token text
  [ -f "$TELEGRAM_TOKEN_FILE" ] || return 0
  token=$(tr -d '[:space:]' < "$TELEGRAM_TOKEN_FILE")
  [ -n "$token" ] || return 0

  text="${SEVERITY} — ${LANE}: ${MESSAGE}"
  # --data-urlencode: alert messages routinely contain '&', '=', '+' or
  # non-ASCII text; a plain `-d "text=..."` body would corrupt or truncate
  # them at the Telegram API.
  curl -sf --max-time 10 -X POST \
    "https://api.telegram.org/bot${token}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT}" \
    --data-urlencode "text=${text}" >/dev/null 2>&1 || true
}
|
||||
|
||||
# ── Actions by severity ──
|
||||
# ── Actions by severity ──
# Escalation ladder: INFO logs; WARNING adds Telegram; CRITICAL adds a
# per-lane pause flag; HALT additionally kills the lane's processes.
case "${SEV_UPPER}" in
  INFO)
    log "Info event — no further action"
    ;;

  WARNING)
    log "Warning issued — sending Telegram alert"
    send_telegram
    ;;

  CRITICAL)
    log "Critical issue — sending Telegram + pausing lane"
    send_telegram

    # Pause flag: any loop is expected to skip work while this file exists.
    CRITICAL_FLAG="$ANDON_CRITICAL_DIR/${LANE}.flag"
    echo "$(timestamp) — $MESSAGE" > "$CRITICAL_FLAG"

    # General flag for Andon board visibility.
    touch "$ANDON_FLAG_DIR/${LANE}.critical"
    ;;

  HALT)
    log "HALT condition — sending Telegram + killing loop processes"
    send_telegram

    # HALT flag with reason.
    HALT_FLAG="$ANDON_HALT_DIR/${LANE}.flag"
    echo "$(timestamp) — $MESSAGE" > "$HALT_FLAG"

    # General flag for Andon board.
    touch "$ANDON_FLAG_DIR/${LANE}.halt"

    # Kill processes associated with this lane.
    # Convention: process command lines contain the lane identifier.
    case "$LANE" in
      agent-loop|claude-loop|gemini-loop|groq-loop|grok-loop|gemma4-loop)
        pkill -f "${LANE}.sh" 2>/dev/null || true
        ;;
      jidoka-gate)
        pkill -f "quality-gate" 2>/dev/null || true
        ;;
      watchdog)
        # Do not kill the watchdog itself; this alert reports a restart
        # that has already happened.
        ;;
      *)
        # Generic fallback: kill by lane name — but never this script
        # itself. Our own command line contains "$LANE" as an argument,
        # so a bare `pkill -f "$LANE"` would match (and kill) this very
        # process before the HALT finished. Skip our own PID explicitly.
        for victim in $(pgrep -f "$LANE" 2>/dev/null || true); do
          [ "$victim" = "$$" ] && continue
          kill "$victim" 2>/dev/null || true
        done
        ;;
    esac
    ;;

  *)
    log "ERROR: Unknown severity '$SEVERITY'. Must be INFO, WARNING, CRITICAL, or HALT."
    exit 1
    ;;
esac

exit 0
|
||||
@@ -359,6 +359,8 @@ handle_stuck() {
|
||||
if [ "$attempts" -ge "$MAX_RESTART_ATTEMPTS" ]; then
|
||||
log "ESCALATION: $target stuck ${attempts}x — manual intervention needed"
|
||||
echo "ALERT: $target stuck after $attempts restart attempts" >&2
|
||||
# ANDON Alert: CRITICAL — watchdog exhausted restart budget
|
||||
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "Watchdog failed to restart $target after ${attempts} attempts — manual intervention required" "watchdog" 2>/dev/null || true
|
||||
return 1
|
||||
fi
|
||||
|
||||
@@ -367,6 +369,8 @@ handle_stuck() {
|
||||
|
||||
if restart_pane "$target"; then
|
||||
log "OK: $target restarted successfully"
|
||||
# ANDON Alert: WARNING — watchdog restart succeeded
|
||||
"$(dirname "$0")/andon-alert.sh" "WARNING" "Watchdog restarted pane $target" "watchdog" 2>/dev/null || true
|
||||
else
|
||||
log "FAIL: $target restart failed (attempt $attempts)"
|
||||
fi
|
||||
|
||||
@@ -23,6 +23,10 @@ log() {
|
||||
log "Running model health check..."
|
||||
if ! bash "$SCRIPT_DIR/model-health-check.sh"; then
|
||||
log "FATAL: Model health check failed. Aborting loop startup."
|
||||
# ANDON Alert: CRITICAL — loop startup blocked
|
||||
if [ -x "$SCRIPT_DIR/andon-alert.sh" ]; then
|
||||
"$SCRIPT_DIR/andon-alert.sh" "CRITICAL" "Model health check failed — loops not started" "startup" 2>/dev/null || true
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
log "Model health check passed."
|
||||
|
||||
Reference in New Issue
Block a user