feat(guards): add jidoka-gate.sh and andon-alert.sh — Japanese wisdom guards
andon-alert.sh — Real-time signal light for the fleet
- Severity levels: INFO (log only), WARNING (Telegram), CRITICAL (pause loop), HALT (kill + flag)
- Sends Telegram alerts for WARNING and above
- Creates halt flag files for HALT severity

jidoka-gate.sh — Stop the line on defect
- Checks last N agent completions for valid PRs
- Halts agent loop if too many failures (configurable threshold)
- Calls andon-alert.sh HALT on quality gate failure

Wired into:
- loop-watchdog.sh: checks halt flags before restart, runs jidoka every 4th cycle
- agent-loop.sh: andon WARNING on API key preflight failure
- Symlinked to ~/.hermes/bin/ for fleet-wide access

README.md updated with guard #6 and #7 documentation.
This commit is contained in:
@@ -33,6 +33,19 @@ Each is a standalone script that can be called from loop scripts, CI, or git hoo
|
||||
**Exit code:** 0 = attempt allowed, 1 = max exceeded
|
||||
**Default:** 3 attempts max. State stored in `~/.hermes/logs/<agent>-attempts.json`
|
||||
|
||||
### 6. andon-alert.sh
|
||||
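**Purpose:** Real-time signal light for the fleet. Logs every event, sends Telegram alerts for WARNING and above, pauses the source loop on CRITICAL (by killing its processes with `pkill`; no flag file is written), and on HALT kills the loop and writes a halt flag file.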
|
||||
**Usage:** `./andon-alert.sh SEVERITY MESSAGE SOURCE`
|
||||
**Severities:** INFO (log only), WARNING (Telegram ⚠️), CRITICAL (Telegram 🔴 + pause), HALT (Telegram 🛑 + kill + flag file)
|
||||
**Flag files:** `~/.hermes/logs/<source>-jidoka-halt` (created on HALT, checked by watchdog)
|
||||
|
||||
### 7. jidoka-gate.sh
|
||||
**Purpose:** Stop the line on defect. Checks last N agent completions for valid PRs. If too many fail, triggers HALT via andon-alert.
|
||||
**Usage:** `./jidoka-gate.sh <agent-name> [check-count] [fail-threshold]`
|
||||
**Exit code:** 0 = quality OK, 1 = quality below threshold (line halted)
|
||||
**Default:** Checks last 5, halts if 3+ have no valid PR.
|
||||
**Integration:** Called by loop-watchdog.sh every 4th cycle (~1 hour).
|
||||
|
||||
## Integration
|
||||
|
||||
```bash
|
||||
|
||||
48
hermes-sovereign/guards/andon-alert.sh
Executable file
48
hermes-sovereign/guards/andon-alert.sh
Executable file
@@ -0,0 +1,48 @@
|
||||
#!/bin/bash
# andon-alert.sh — Real-time signal light for the fleet
# Usage: andon-alert.sh SEVERITY MESSAGE SOURCE
# Severity: INFO | WARNING | CRITICAL | HALT
#
# What each severity does:
#   INFO      append one line to $HOME/.hermes/logs/andon.log
#   WARNING   log + Telegram message (only if a bot token could be read)
#   CRITICAL  log + Telegram + SIGTERM to processes whose command line
#             matches SOURCE
#   HALT      as CRITICAL, plus create $HOME/.hermes/logs/<SOURCE>-jidoka-halt
#
# Any severity other than INFO is treated as "WARNING or above" for Telegram.
set -eo pipefail

SEVERITY="${1:-INFO}"
MESSAGE="${2:-No message}"
SOURCE="${3:-unknown}"
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
LOG_DIR="$HOME/.hermes/logs"
LOG_FILE="$LOG_DIR/andon.log"
# A missing/unreadable token file must not abort the script under `set -e`;
# an empty token simply disables the Telegram step.
BOT_TOKEN=$(cat ~/.config/telegram/special_bot 2>/dev/null) || BOT_TOKEN=""
CHAT_ID="-1003664764329"

# Append one line to the andon log.
log_line() {
  printf '%s\n' "$*" >> "$LOG_FILE"
}

# Print the Telegram message body.
# Arguments: $1 severity, $2 message, $3 source, $4 timestamp
# The values are passed as printf arguments so that the Markdown underscore
# directly after them is never parsed as part of a shell variable name
# (the bug behind "$SOURCE_" / "$TIMESTAMP_" expanding to nothing).
format_alert() {
  local severity=$1 message=$2 source=$3 timestamp=$4
  local icon="⚠️"
  case "$severity" in
    CRITICAL) icon="🔴" ;;
    HALT)     icon="🛑" ;;
  esac
  printf '%s *ANDON — %s*\n_Source: %s_\n%s\n_%s_' \
    "$icon" "$severity" "$source" "$message" "$timestamp"
}

# POST the given text to the Telegram chat. Returns curl's exit status.
# --data-urlencode keeps '&', '+', '%' etc. in the message intact;
# --max-time keeps a hung network from blocking the kill/flag steps forever.
send_telegram() {
  local text=$1
  curl -s --max-time 15 -X POST \
    "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${CHAT_ID}" \
    --data-urlencode "text=${text}" \
    --data-urlencode "parse_mode=Markdown" > /dev/null 2>&1
}

# Send SIGTERM to every process whose command line matches $1, except this
# script. SOURCE is one of this script's own arguments, so a plain
# `pkill -f "$SOURCE"` also matches (and terminates) andon-alert.sh itself.
# Returns 1 without killing anything when SOURCE was not supplied, because
# the placeholder "unknown" would match unrelated processes.
stop_source() {
  local pattern=$1 pids pid
  if [ "$pattern" = "unknown" ]; then
    return 1
  fi
  pids=$(pgrep -f -- "$pattern" 2>/dev/null) || pids=""
  for pid in $pids; do
    if [ "$pid" != "$$" ]; then
      kill "$pid" 2>/dev/null || true
    fi
  done
  return 0
}

mkdir -p "$LOG_DIR"

# Always log
log_line "[$TIMESTAMP] [$SEVERITY] [$SOURCE] $MESSAGE"

# Telegram for WARNING and above. Delivery is best-effort: a failed request
# is logged but must never prevent the CRITICAL/HALT actions below.
if [ "$SEVERITY" != "INFO" ] && [ -n "$BOT_TOKEN" ]; then
  send_telegram "$(format_alert "$SEVERITY" "$MESSAGE" "$SOURCE" "$TIMESTAMP")" \
    || log_line "[$TIMESTAMP] Telegram delivery failed for $SEVERITY from $SOURCE"
fi

case "$SEVERITY" in
  CRITICAL)
    # Pause the source loop
    if stop_source "$SOURCE"; then
      log_line "[$TIMESTAMP] CRITICAL: Paused $SOURCE"
    else
      log_line "[$TIMESTAMP] CRITICAL: no SOURCE given, nothing paused"
    fi
    ;;
  HALT)
    # Create the flag before killing, so the flag already exists by the time
    # the loop's processes die.
    touch "$LOG_DIR/${SOURCE}-jidoka-halt"
    if stop_source "$SOURCE"; then
      log_line "[$TIMESTAMP] HALT: Killed $SOURCE, flag file created"
    else
      log_line "[$TIMESTAMP] HALT: no SOURCE given, nothing killed, flag file created"
    fi
    ;;
esac
|
||||
82
hermes-sovereign/guards/jidoka-gate.sh
Executable file
82
hermes-sovereign/guards/jidoka-gate.sh
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/bin/bash
# jidoka-gate.sh — Stop the line on defect
# Usage: jidoka-gate.sh <agent-name> [check-count] [fail-threshold]
# Checks last N completions, halts if too many fail quality
#
# A completion is any "#<number>" found on the last <check-count> lines of
# the agent's loop log that contain "complete". Each number passes when an
# open PR whose head branch contains "issue-<number>" has changed_files > 0.
#
# Exit code: 0 = quality OK (or nothing could be checked)
#            1 = failures reached the threshold; andon-alert.sh HALT was called
#            2 = check-count / fail-threshold is not a non-negative integer
set -eo pipefail

AGENT="${1:?Usage: jidoka-gate.sh <agent-name>}"
CHECK_COUNT="${2:-5}"
FAIL_THRESHOLD="${3:-3}"
GUARD_DIR="$HOME/.hermes/bin"
# A missing token file must not abort the script under `set -e`: a bare
# non-zero exit here would look like a quality failure to the caller.
TOKEN=$(cat ~/.hermes/gitea_token_vps 2>/dev/null) || TOKEN=""
API="https://forge.alexanderwhitestone.com/api/v1"
LOG="$HOME/.hermes/logs/jidoka.log"
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')
# Repos searched for a matching PR, in priority order.
REPOS=(the-nexus hermes-agent timmy-config timmy-home)

# Reject non-numeric counts up front; otherwise `tail` and `[ -ge ]` fail
# quietly and the gate reports "OK" without having checked anything.
for numeric_arg in "$CHECK_COUNT" "$FAIL_THRESHOLD"; do
  case "$numeric_arg" in
    ''|*[!0-9]*)
      echo "jidoka-gate.sh: check-count and fail-threshold must be non-negative integers" >&2
      exit 2
      ;;
  esac
done

mkdir -p "$(dirname "$LOG")"

# Download the open-PR list of one repo into a file.
# Arguments: $1 repo name, $2 destination file
# Returns 0 only when the forge answered successfully with a JSON array.
# NOTE(review): only open PRs are queried (as before), so a PR that was
# already merged or closed counts as "NO PR FOUND" — confirm this is intended.
fetch_open_prs() {
  local repo=$1 dest=$2
  local -a auth=()
  if [ -n "$TOKEN" ]; then
    auth=(-H "Authorization: token $TOKEN")
  fi
  curl -sf --max-time 30 "${auth[@]}" -o "$dest" \
    "$API/repos/Timmy_Foundation/$repo/pulls?state=open&limit=50" \
    2>/dev/null || return 1
  python3 -c 'import json, sys
sys.exit(0 if isinstance(json.load(open(sys.argv[1])), list) else 1)' \
    "$dest" 2>/dev/null
}

# Print changed_files of the first PR whose head branch names the issue, or
# "none" when no PR matches.
# Arguments: $1 issue number, remaining: PR-list JSON files in priority order
# The (?![0-9]) lookahead stops issue 12 from matching branch "issue-123".
find_pr_changed_files() {
  python3 - "$@" <<'PY'
import json
import re
import sys

issue, paths = sys.argv[1], sys.argv[2:]
wanted = re.compile(r"issue-%s(?![0-9])" % re.escape(issue))
for path in paths:
    with open(path) as handle:
        for pr in json.load(handle):
            ref = (pr.get("head") or {}).get("ref") or ""
            if wanted.search(ref):
                print(pr.get("changed_files") or 0)
                sys.exit(0)
print("none")
PY
}

# Find the agent's loop log
LOOP_LOG="$HOME/.hermes/logs/${AGENT}-loop.log"

# Get last N completed issue numbers (portable ERE instead of GNU-only -P).
COMPLETED=""
if [ -r "$LOOP_LOG" ]; then
  COMPLETED=$(grep 'complete' "$LOOP_LOG" 2>/dev/null \
    | tail -n "$CHECK_COUNT" \
    | grep -oE '#[0-9]+' \
    | tr -d '#' \
    | sort -u) || true
fi

if [ -z "$COMPLETED" ]; then
  echo "[$TIMESTAMP] No completions to check for $AGENT" >> "$LOG"
  exit 0
fi

# Fetch each repo's PR list once instead of once per issue.
WORK_DIR=$(mktemp -d)
trap 'rm -rf -- "$WORK_DIR"' EXIT

PR_LISTS=()
FETCH_ERRORS=0
for repo in "${REPOS[@]}"; do
  if fetch_open_prs "$repo" "$WORK_DIR/$repo.json"; then
    PR_LISTS+=("$WORK_DIR/$repo.json")
  else
    FETCH_ERRORS=$((FETCH_ERRORS+1))
    echo "[$TIMESTAMP] JIDOKA: could not fetch PR list for $repo" >> "$LOG"
  fi
done

# Check each — does the PR actually exist with real changes?
PASSED=0
FAILED=0
UNVERIFIED=0
DETAILS=""

for issue_num in $COMPLETED; do
  FILES=$(find_pr_changed_files "$issue_num" "${PR_LISTS[@]}" 2>/dev/null) || FILES="none"

  if [ "$FILES" = "none" ] || [ -z "$FILES" ]; then
    if [ "$FETCH_ERRORS" -gt 0 ]; then
      # The PR may live in a repo we could not query: an API outage is not a
      # quality defect, so do not count it towards the halt threshold.
      UNVERIFIED=$((UNVERIFIED+1))
      DETAILS="$DETAILS"$'\n'" #$issue_num: UNVERIFIED (PR list unavailable)"
    else
      FAILED=$((FAILED+1))
      DETAILS="$DETAILS"$'\n'" #$issue_num: NO PR FOUND"
    fi
  elif [ "$FILES" -gt 0 ] 2>/dev/null; then
    PASSED=$((PASSED+1))
    DETAILS="$DETAILS"$'\n'" #$issue_num: OK ($FILES files)"
  else
    FAILED=$((FAILED+1))
    DETAILS="$DETAILS"$'\n'" #$issue_num: EMPTY PR (0 files)"
  fi
done

SUMMARY="[$TIMESTAMP] JIDOKA CHECK: $AGENT — $PASSED passed, $FAILED failed of $((PASSED+FAILED)) checked"
if [ "$UNVERIFIED" -gt 0 ]; then
  SUMMARY="$SUMMARY ($UNVERIFIED unverified)"
fi
printf '%s\n' "$SUMMARY" >> "$LOG"
printf '%s\n' "$DETAILS" >> "$LOG"

# If failures exceed threshold: HALT
if [ "$FAILED" -ge "$FAIL_THRESHOLD" ]; then
  echo "[$TIMESTAMP] JIDOKA HALT: $AGENT quality below threshold ($FAILED/$((PASSED+FAILED)) failed)" >> "$LOG"
  # Keep the documented exit code 1 even if the alert script itself fails.
  "$GUARD_DIR/andon-alert.sh" "HALT" "Quality gate failed: $AGENT — $FAILED of $((PASSED+FAILED)) completions have no valid PR. Line stopped." "$AGENT-loop" \
    || echo "[$TIMESTAMP] JIDOKA: andon-alert.sh exited non-zero; halt may be incomplete" >> "$LOG"
  exit 1
fi

echo "JIDOKA: $AGENT quality OK ($PASSED passed, $FAILED failed)"
exit 0
|
||||
Reference in New Issue
Block a user