From 5bbb09bd58577c33e1bd289e66eafbed65605ebd Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Tue, 7 Apr 2026 10:35:22 -0400 Subject: [PATCH] =?UTF-8?q?feat(guards):=20add=20jidoka-gate.sh=20and=20an?= =?UTF-8?q?don-alert.sh=20=E2=80=94=20Japanese=20wisdom=20guards?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit andon-alert.sh — Real-time signal light for the fleet Severity levels: INFO (log only), WARNING (Telegram), CRITICAL (pause loop), HALT (kill + flag) Sends Telegram alerts for WARNING and above Creates halt flag files for HALT severity jidoka-gate.sh — Stop the line on defect Checks last N agent completions for valid PRs Halts agent loop if too many failures (configurable threshold) Calls andon-alert.sh HALT on quality gate failure Wired into: - loop-watchdog.sh: checks halt flags before restart, runs jidoka every 4th cycle - agent-loop.sh: andon WARNING on API key preflight failure - Symlinked to ~/.hermes/bin/ for fleet-wide access README.md updated with guard #6 and #7 documentation. --- hermes-sovereign/guards/README.md | 13 ++++ hermes-sovereign/guards/andon-alert.sh | 48 +++++++++++++++ hermes-sovereign/guards/jidoka-gate.sh | 82 ++++++++++++++++++++++++++ 3 files changed, 143 insertions(+) create mode 100755 hermes-sovereign/guards/andon-alert.sh create mode 100755 hermes-sovereign/guards/jidoka-gate.sh diff --git a/hermes-sovereign/guards/README.md b/hermes-sovereign/guards/README.md index 07fcd112..0b189e09 100644 --- a/hermes-sovereign/guards/README.md +++ b/hermes-sovereign/guards/README.md @@ -33,6 +33,19 @@ Each is a standalone script that can be called from loop scripts, CI, or git hoo **Exit code:** 0 = attempt allowed, 1 = max exceeded **Default:** 3 attempts max. State stored in `~/.hermes/logs/-attempts.json` +### 6. andon-alert.sh +**Purpose:** Real-time signal light for the fleet. Logs all events, sends Telegram alerts for WARNING+, pauses loops on CRITICAL, halts with flag file on HALT. +**Usage:** `./andon-alert.sh SEVERITY MESSAGE SOURCE` +**Severities:** INFO (log only), WARNING (Telegram ⚠️), CRITICAL (Telegram 🔴 + pause), HALT (Telegram 🛑 + kill + flag file) +**Flag files:** `~/.hermes/logs/-jidoka-halt` (created on HALT, checked by watchdog) + +### 7. jidoka-gate.sh +**Purpose:** Stop the line on defect. Checks last N agent completions for valid PRs. If too many fail, triggers HALT via andon-alert. +**Usage:** `./jidoka-gate.sh [check-count] [fail-threshold]` +**Exit code:** 0 = quality OK, 1 = quality below threshold (line halted) +**Default:** Checks last 5, halts if 3+ have no valid PR. +**Integration:** Called by loop-watchdog.sh every 4th cycle (~1 hour). + ## Integration ```bash diff --git a/hermes-sovereign/guards/andon-alert.sh b/hermes-sovereign/guards/andon-alert.sh new file mode 100755 index 00000000..2eeca6b7 --- /dev/null +++ b/hermes-sovereign/guards/andon-alert.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# andon-alert.sh — Real-time signal light for the fleet +# Usage: andon-alert.sh SEVERITY MESSAGE SOURCE +# Severity: INFO | WARNING | CRITICAL | HALT +set -eo pipefail + +SEVERITY="${1:-INFO}" +MESSAGE="${2:-No message}" +SOURCE="${3:-unknown}" +TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S') +LOG_FILE="$HOME/.hermes/logs/andon.log" +BOT_TOKEN=$(cat ~/.config/telegram/special_bot 2>/dev/null) +CHAT_ID="-1003664764329" + +mkdir -p "$(dirname "$LOG_FILE")" + +# Always log +echo "[$TIMESTAMP] [$SEVERITY] [$SOURCE] $MESSAGE" >> "$LOG_FILE" + +# Telegram for WARNING and above +if [ "$SEVERITY" != "INFO" ] && [ -n "$BOT_TOKEN" ]; then + ICON="⚠️" + [ "$SEVERITY" = "CRITICAL" ] && ICON="🔴" + [ "$SEVERITY" = "HALT" ] && ICON="🛑" + + MSG="$ICON *ANDON — $SEVERITY* +_Source: $SOURCE_ +$MESSAGE +_$TIMESTAMP_" + + curl -s -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \ + -d "chat_id=${CHAT_ID}" \ + -d "text=${MSG}" \ + -d "parse_mode=Markdown" > /dev/null 2>&1 +fi + +# For CRITICAL: pause the source loop +if [ "$SEVERITY" = "CRITICAL" ]; then + pkill -f "${SOURCE}" 2>/dev/null || true + echo "[$TIMESTAMP] CRITICAL: Paused $SOURCE" >> "$LOG_FILE" +fi + +# For HALT: kill + create flag file +if [ "$SEVERITY" = "HALT" ]; then + pkill -f "${SOURCE}" 2>/dev/null || true + touch "$HOME/.hermes/logs/${SOURCE}-jidoka-halt" + echo "[$TIMESTAMP] HALT: Killed $SOURCE, flag file created" >> "$LOG_FILE" +fi diff --git a/hermes-sovereign/guards/jidoka-gate.sh b/hermes-sovereign/guards/jidoka-gate.sh new file mode 100755 index 00000000..04089b44 --- /dev/null +++ b/hermes-sovereign/guards/jidoka-gate.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# jidoka-gate.sh — Stop the line on defect +# Usage: jidoka-gate.sh [check-count] [fail-threshold] +# Checks last N completions, halts if too many fail quality +set -eo pipefail + +AGENT="${1:?Usage: jidoka-gate.sh }" +CHECK_COUNT="${2:-5}" +FAIL_THRESHOLD="${3:-3}" +GUARD_DIR="$HOME/.hermes/bin" +TOKEN=$(cat ~/.hermes/gitea_token_vps 2>/dev/null) +API="https://forge.alexanderwhitestone.com/api/v1" +LOG="$HOME/.hermes/logs/jidoka.log" +TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S') + +mkdir -p "$(dirname "$LOG")" + +# Find the agent's loop log +if [ "$AGENT" = "claude" ]; then + LOOP_LOG="$HOME/.hermes/logs/claude-loop.log" +elif [ "$AGENT" = "gemini" ]; then + LOOP_LOG="$HOME/.hermes/logs/gemini-loop.log" +else + LOOP_LOG="$HOME/.hermes/logs/${AGENT}-loop.log" +fi + +# Get last N completed issue numbers +COMPLETED=$(grep -oP '#\K[0-9]+' <<< "$(grep 'complete' "$LOOP_LOG" 2>/dev/null | tail -$CHECK_COUNT)" 2>/dev/null | sort -u) || true + +if [ -z "$COMPLETED" ]; then + echo "[$TIMESTAMP] No completions to check for $AGENT" >> "$LOG" + exit 0 +fi + +# Check each — does the PR actually exist with real changes? +PASSED=0 +FAILED=0 +DETAILS="" + +for issue_num in $COMPLETED; do + # Search for a PR with this issue number in the branch name + PR_DATA=$(curl -s "$API/repos/Timmy_Foundation/the-nexus/pulls?state=open&limit=50" \ + -H "Authorization: token $TOKEN" 2>/dev/null | \ + python3 -c "import sys,json; prs=json.load(sys.stdin); matches=[p for p in prs if 'issue-$issue_num' in p.get('head',{}).get('ref','')]; print(json.dumps(matches[0]) if matches else 'null')" 2>/dev/null) || true + + if [ "$PR_DATA" = "null" ] || [ -z "$PR_DATA" ]; then + # Try other repos + for repo in hermes-agent timmy-config timmy-home; do + PR_DATA=$(curl -s "$API/repos/Timmy_Foundation/$repo/pulls?state=open&limit=50" \ + -H "Authorization: token $TOKEN" 2>/dev/null | \ + python3 -c "import sys,json; prs=json.load(sys.stdin); matches=[p for p in prs if 'issue-$issue_num' in p.get('head',{}).get('ref','')]; print(json.dumps(matches[0]) if matches else 'null')" 2>/dev/null) || true + [ "$PR_DATA" != "null" ] && [ -n "$PR_DATA" ] && break + done + fi + + if [ "$PR_DATA" = "null" ] || [ -z "$PR_DATA" ]; then + FAILED=$((FAILED+1)) + DETAILS="$DETAILS\n #$issue_num: NO PR FOUND" + else + FILES=$(echo "$PR_DATA" | python3 -c "import sys,json; print(json.load(sys.stdin).get('changed_files',0))" 2>/dev/null) || true + if [ "$FILES" -gt 0 ] 2>/dev/null; then + PASSED=$((PASSED+1)) + DETAILS="$DETAILS\n #$issue_num: OK ($FILES files)" + else + FAILED=$((FAILED+1)) + DETAILS="$DETAILS\n #$issue_num: EMPTY PR (0 files)" + fi + fi +done + +echo "[$TIMESTAMP] JIDOKA CHECK: $AGENT — $PASSED passed, $FAILED failed of $((PASSED+FAILED)) checked" >> "$LOG" +echo -e "$DETAILS" >> "$LOG" + +# If failures exceed threshold: HALT +if [ "$FAILED" -ge "$FAIL_THRESHOLD" ]; then + echo "[$TIMESTAMP] JIDOKA HALT: $AGENT quality below threshold ($FAILED/$((PASSED+FAILED)) failed)" >> "$LOG" + "$GUARD_DIR/andon-alert.sh" "HALT" "Quality gate failed: $AGENT — $FAILED of $((PASSED+FAILED)) completions have no valid PR. Line stopped." "$AGENT-loop" + exit 1 +fi + +echo "JIDOKA: $AGENT quality OK ($PASSED passed, $FAILED failed)" +exit 0