Some checks failed
Smoke Test / smoke (pull_request) Failing after 20s
Architecture Lint / Linter Tests (pull_request) Successful in 24s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 17s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 57s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Deploy Script Dry Run (pull_request) Successful in 11s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Config / Shell Script Lint (pull_request) Failing after 1m3s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Architecture Lint / Lint Repository (pull_request) Failing after 21s
PR Checklist / pr-checklist (pull_request) Successful in 5m4s
- Add bin/andon-alert.sh with severity levels: INFO, WARNING, CRITICAL, HALT
- Wire into agent-loop: CRITICAL on 3 consecutive failures and dispatch key validation errors
- Wire into start-loops: CRITICAL on model health check failure
- Wire into pane-watchdog: WARNING on successful restart, CRITICAL on max restart exhaustion
- Creates ~/.hermes/andon-{flags,critical,halt} directories for state
The Andon system makes failures visible immediately via Telegram alerts
and local flag files, turning the factory floor into a transparent
operations center.
Closes #347
119 lines
3.8 KiB
Bash
Executable File
119 lines
3.8 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# andon-alert.sh — Real-time signal light for the factory floor
# Part of Timmy_Foundation/timmy-config issue #347
#
# Usage: andon-alert.sh <SEVERITY> "<MESSAGE>" "<LANE>"
#   andon-alert.sh "CRITICAL" "Groq API key expired" "agent-loop"
#   andon-alert.sh "WARNING" "Gemini rate limited 10 times in a row" "gemini-loop"
#   andon-alert.sh "HALT" "Jidoka quality gate triggered" "jidoka-gate"
#
# Severity levels:
#   INFO     — logged only
#   WARNING  — logged + Telegram
#   CRITICAL — logged + Telegram + loop paused (flag file)
#   HALT     — logged + Telegram + loop killed + flag file
|
|
|
|
# Treat unset variables as errors and let a pipeline report the failure of any
# of its stages. errexit (-e) is not enabled, so a failing command does not by
# itself abort the script.
set -u
set -o pipefail

# ── Arguments ──
# All three positional arguments are mandatory; a missing or empty one aborts
# the script with the usage text.
_andon_usage='Usage: andon-alert.sh <SEVERITY> <MESSAGE> <LANE>'
SEVERITY="${1:?$_andon_usage}"
MESSAGE="${2:?$_andon_usage}"
LANE="${3:?$_andon_usage}"

# Upper-cased copy of the severity, used only for dispatch so that callers may
# pass e.g. "critical" or "Critical".
SEV_UPPER=$(
  printf '%s' "$SEVERITY" \
    | tr '[:lower:]' '[:upper:]'
)
|
|
|
|
# ── Config ──
# Root of all Hermes state. A non-empty HERMES_HOME from the environment wins;
# otherwise it defaults to ~/.hermes.
: "${HERMES_HOME:=$HOME/.hermes}"

# Append-only log of every alert.
LOG_DIR="${HERMES_HOME}/logs"
ANDON_LOG="${LOG_DIR}/andon-alert.log"

# Per-lane state files: general board markers, CRITICAL flags, HALT flags.
ANDON_FLAG_DIR="${HERMES_HOME}/andon-flags"
ANDON_CRITICAL_DIR="${HERMES_HOME}/andon-critical"
ANDON_HALT_DIR="${HERMES_HOME}/andon-halt"

# Telegram delivery: file holding the bot token, and the destination chat id.
TELEGRAM_TOKEN_FILE="${HOME}/.config/telegram/special_bot"
TELEGRAM_CHAT="-1003664764329"

# Make sure every directory written to later exists.
mkdir -p \
  "$LOG_DIR" \
  "$ANDON_FLAG_DIR" \
  "$ANDON_CRITICAL_DIR" \
  "$ANDON_HALT_DIR"
|
|
|
|
# Print the current time as "YYYY-MM-DD HH:MM:SS" on stdout.
timestamp() {
  date +'%Y-%m-%d %H:%M:%S'
}
|
|
|
|
#######################################
# Write one alert line to stdout and append it to the Andon log.
# Globals:   SEVERITY, LANE, MESSAGE, ANDON_LOG (all read)
# Arguments: $1 - optional note describing the action being taken; when
#                 given it is appended to the line after " — ". Without it
#                 the line is "[time] [severity] [lane] message".
# Outputs:   the formatted line on stdout and appended to $ANDON_LOG
# Returns:   status of the printf | tee pipeline
#######################################
log() {
  local note=${1:-}
  local line

  line="[$(timestamp)] [$SEVERITY] [$LANE] $MESSAGE"
  # The note used to be silently discarded, which hid texts such as the
  # "unknown severity" error from both the terminal and the log file.
  if [ -n "$note" ]; then
    line="$line — $note"
  fi

  printf '%s\n' "$line" | tee -a "$ANDON_LOG"
}
|
|
|
|
#######################################
# Post the current alert to the configured Telegram chat (best effort).
# Globals:   TELEGRAM_TOKEN_FILE, TELEGRAM_CHAT, SEVERITY, LANE, MESSAGE (read)
# Arguments: none
# Outputs:   nothing; curl output and errors are discarded
# Returns:   always 0 — a missing token file, an empty token or a failed
#            request must never make the alert script itself fail
#######################################
send_telegram() {
  local token text

  # No token file: nothing to send with, skip quietly.
  [ -f "$TELEGRAM_TOKEN_FILE" ] || return 0

  # Strip all whitespace so a trailing newline in the file does not end up
  # inside the request URL.
  token=$(tr -d '[:space:]' < "$TELEGRAM_TOKEN_FILE")
  [ -n "$token" ] || return 0

  text="${SEVERITY} — ${LANE}: ${MESSAGE}"

  # --data-urlencode instead of -d: lane and message are free text, and an
  # unencoded '&', '+', '%' or '=' would corrupt or truncate the form body.
  # NOTE(review): the token is part of the URL argument and therefore visible
  # in the process list while curl runs.
  curl -sf --max-time 10 -X POST \
    "https://api.telegram.org/bot${token}/sendMessage" \
    --data-urlencode "chat_id=${TELEGRAM_CHAT}" \
    --data-urlencode "text=${text}" >/dev/null 2>&1 || true
}
|
|
|
|
# ── Actions by severity ──

#######################################
# Send SIGTERM to every process whose full command line matches a pattern,
# except this script itself.
# `pkill -f` is not used directly: the lane name is one of this script's own
# arguments, so a pattern built from it can match this script's command line
# and terminate the script before it reaches its final `exit 0`.
# The parent process is deliberately not excluded.
# Arguments: $1 - extended regular expression, matched as by `pgrep -f`
# Returns:   always 0 (best effort)
#######################################
terminate_matching() {
  local pattern=$1
  local pids pid

  pids=$(pgrep -f -- "$pattern" 2>/dev/null) || true
  # Intentionally unquoted: pgrep prints one numeric PID per line.
  for pid in $pids; do
    if [ "$pid" != "$$" ]; then
      kill "$pid" 2>/dev/null || true
    fi
  done
  return 0
}

case "${SEV_UPPER}" in
  INFO)
    log "Info event — no further action"
    ;;

  WARNING)
    log "Warning issued — sending Telegram alert"
    send_telegram
    ;;

  CRITICAL)
    log "Critical issue — sending Telegram + pausing lane"
    send_telegram

    # Per-lane pause flag holding the time and reason. Loops are expected to
    # check for this file and skip work until it is cleared.
    CRITICAL_FLAG="$ANDON_CRITICAL_DIR/${LANE}.flag"
    printf '%s — %s\n' "$(timestamp)" "$MESSAGE" > "$CRITICAL_FLAG"

    # General marker for Andon board visibility.
    touch "$ANDON_FLAG_DIR/${LANE}.critical"
    ;;

  HALT)
    log "HALT condition — sending Telegram + killing loop processes"
    send_telegram

    # Per-lane HALT flag holding the time and reason.
    HALT_FLAG="$ANDON_HALT_DIR/${LANE}.flag"
    printf '%s — %s\n' "$(timestamp)" "$MESSAGE" > "$HALT_FLAG"

    # General marker for the Andon board.
    touch "$ANDON_FLAG_DIR/${LANE}.halt"

    # Kill processes associated with this lane.
    # Convention: process names contain the lane identifier.
    case "$LANE" in
      agent-loop | claude-loop | gemini-loop | groq-loop | grok-loop | gemma4-loop)
        # Escaped dot: match the literal "<lane>.sh", not any character.
        terminate_matching "${LANE}\.sh"
        ;;
      jidoka-gate)
        terminate_matching "quality-gate"
        ;;
      watchdog)
        # Do not kill the watchdog itself; this alert is about a restart that
        # already happened.
        :
        ;;
      *)
        # Generic: try to kill by lane name.
        terminate_matching "$LANE"
        ;;
    esac
    ;;

  *)
    log "ERROR: Unknown severity '$SEVERITY'. Must be INFO, WARNING, CRITICAL, or HALT."
    exit 1
    ;;
esac

exit 0
|