feat(andon): implement real-time signal light system (#347)
Some checks failed
Smoke Test / smoke (pull_request) Failing after 20s
Architecture Lint / Linter Tests (pull_request) Successful in 24s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 17s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 57s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Deploy Script Dry Run (pull_request) Successful in 11s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Config / Shell Script Lint (pull_request) Failing after 1m3s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Architecture Lint / Lint Repository (pull_request) Failing after 21s
PR Checklist / pr-checklist (pull_request) Successful in 5m4s
Some checks failed
Smoke Test / smoke (pull_request) Failing after 20s
Architecture Lint / Linter Tests (pull_request) Successful in 24s
Validate Config / YAML Lint (pull_request) Failing after 15s
Validate Config / JSON Validate (pull_request) Successful in 17s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 57s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Deploy Script Dry Run (pull_request) Successful in 11s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Config / Shell Script Lint (pull_request) Failing after 1m3s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Architecture Lint / Lint Repository (pull_request) Failing after 21s
PR Checklist / pr-checklist (pull_request) Successful in 5m4s
- Add bin/andon-alert.sh with severity levels: INFO, WARNING, CRITICAL, HALT
- Wire into agent-loop: CRITICAL on the 3rd consecutive failure and on agent-dispatch failures (key validation or config errors)
- Wire into start-loops: CRITICAL on model health check failure
- Wire into pane-watchdog: WARNING on successful restart, CRITICAL on max restart exhaustion
- Create ~/.hermes/andon-{flags,critical,halt} directories for state
The Andon system makes failures visible immediately via Telegram alerts
and local flag files, turning the factory floor into a transparent
operations center.
Closes #347
This commit is contained in:
@@ -143,6 +143,14 @@ run_worker() {
|
||||
|
||||
# Generate prompt
|
||||
prompt=$(bash "$(dirname "$0")/agent-dispatch.sh" "$AGENT" "$issue_num" "${repo_owner}/${repo_name}")
|
||||
dispatch_rc=$?
|
||||
if [ $dispatch_rc -ne 0 ]; then
|
||||
log "WORKER-${worker_id}: DISPATCH FAILED #${issue_num} (rc=$dispatch_rc) — key validation or config error"
|
||||
# ANDON Alert: CRITICAL — loop startup (key validation) failure
|
||||
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "agent-dispatch failed for ${AGENT} (issue #${issue_num}) — check API keys/tokens" "${AGENT}-loop" 2>/dev/null || true
|
||||
# Exit worker — watchdog will handle restart/ESCALATION
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CYCLE_START=$(date +%s)
|
||||
set +e
|
||||
@@ -236,13 +244,25 @@ print(json.dumps({
|
||||
log "WORKER-${worker_id}: UNVERIFIED #${issue_num} — $verify_details"
|
||||
mark_skip "$issue_num" "unverified" 1
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
# ANDON Alert: CRITICAL on 3rd consecutive failure
|
||||
if [ "$consecutive_failures" -eq 3 ]; then
|
||||
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive failures on ${AGENT}-loop (issue #${issue_num}, unverified)" "${AGENT}-loop" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
elif [ "$exit_code" -eq 124 ]; then
|
||||
log "WORKER-${worker_id}: TIMEOUT #${issue_num} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
# ANDON Alert: CRITICAL on 3rd consecutive failure
|
||||
if [ "$consecutive_failures" -eq 3 ]; then
|
||||
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive TIMEOUT failures on ${AGENT}-loop (issue #${issue_num})" "${AGENT}-loop" 2>/dev/null || true
|
||||
fi
|
||||
else
|
||||
log "WORKER-${worker_id}: FAILED #${issue_num} exit ${exit_code} (work saved in PR)"
|
||||
consecutive_failures=$((consecutive_failures + 1))
|
||||
# ANDON Alert: CRITICAL on 3rd consecutive failure
|
||||
if [ "$consecutive_failures" -eq 3 ]; then
|
||||
"$(dirname "$0")/andon-alert.sh" "CRITICAL" "3 consecutive FAILED exits on ${AGENT}-loop (issue #${issue_num}, exit=${exit_code})" "${AGENT}-loop" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── METRICS ──
|
||||
|
||||
Reference in New Issue
Block a user