feat(andon): implement real-time signal light system (#347)
Some checks failed
Smoke Test / smoke (pull_request) Failing after 20s
Architecture Lint / Linter Tests (pull_request) Successful in 24s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 17s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 57s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Deploy Script Dry Run (pull_request) Successful in 11s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Config / Shell Script Lint (pull_request) Failing after 1m3s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Architecture Lint / Lint Repository (pull_request) Failing after 21s
PR Checklist / pr-checklist (pull_request) Successful in 5m4s

- Add bin/andon-alert.sh with severity levels: INFO, WARNING, CRITICAL, HALT
- Wire into agent-loop: CRITICAL on the 3rd consecutive failure, and on any agent-dispatch failure (key validation or config error)
- Wire into start-loops: CRITICAL on model health check failure
- Wire into pane-watchdog: WARNING on successful restart, CRITICAL on max restart exhaustion
- Create ~/.hermes/andon-{flags,critical,halt} directories for state

The Andon system makes failures visible immediately via Telegram alerts
and local flag files, turning the factory floor into a transparent
operations center.

Closes #347
This commit is contained in:
Alexander Payne
2026-04-27 09:41:57 -04:00
parent 34a1e68e67
commit a4261d2dff
4 changed files with 146 additions and 0 deletions

View File

@@ -143,6 +143,14 @@ run_worker() {
# Generate prompt
prompt=$(bash "$(dirname "$0")/agent-dispatch.sh" "$AGENT" "$issue_num" "${repo_owner}/${repo_name}")
dispatch_rc=$?
if [ $dispatch_rc -ne 0 ]; then
log "WORKER-${worker_id}: DISPATCH FAILED #${issue_num} (rc=$dispatch_rc) — key validation or config error"
# ANDON Alert: CRITICAL — loop startup (key validation) failure
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "agent-dispatch failed for ${AGENT} (issue #${issue_num}) — check API keys/tokens" "${AGENT}-loop" 2>/dev/null || true
# Exit worker — watchdog will handle restart/ESCALATION
exit 1
fi
CYCLE_START=$(date +%s)
set +e
@@ -236,13 +244,25 @@ print(json.dumps({
log "WORKER-${worker_id}: UNVERIFIED #${issue_num}$verify_details"
mark_skip "$issue_num" "unverified" 1
consecutive_failures=$((consecutive_failures + 1))
# ANDON Alert: CRITICAL on 3rd consecutive failure
if [ "$consecutive_failures" -eq 3 ]; then
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive failures on ${AGENT}-loop (issue #${issue_num}, unverified)" "${AGENT}-loop" 2>/dev/null || true
fi
fi
elif [ "$exit_code" -eq 124 ]; then
log "WORKER-${worker_id}: TIMEOUT #${issue_num} (work saved in PR)"
consecutive_failures=$((consecutive_failures + 1))
# ANDON Alert: CRITICAL on 3rd consecutive failure
if [ "$consecutive_failures" -eq 3 ]; then
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive TIMEOUT failures on ${AGENT}-loop (issue #${issue_num})" "${AGENT}-loop" 2>/dev/null || true
fi
else
log "WORKER-${worker_id}: FAILED #${issue_num} exit ${exit_code} (work saved in PR)"
consecutive_failures=$((consecutive_failures + 1))
# ANDON Alert: CRITICAL on 3rd consecutive failure
if [ "$consecutive_failures" -eq 3 ]; then
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive FAILED exits on ${AGENT}-loop (issue #${issue_num}, exit=${exit_code})" "${AGENT}-loop" 2>/dev/null || true
fi
fi
# ── METRICS ──