- claude-loop: 7 workers by default, scales up to 21, 5s cooldown
- gemini-loop: rewritten as a parallel worker system (3→12), multi-repo, auto-clone, correct CLI flags (-p/--yolo), bash 3.2 compatible
- loop-watchdog: monitors all loops every 2 min, auto-restarts dead loops, kills zombies, files Gitea issues for unfixable problems
- ops-helpers: added ops-wake-watchdog and ops-kill-watchdog
- All scripts use file-based PID tracking (bash 3.2 safe)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
252 lines
8.3 KiB
Bash
252 lines
8.3 KiB
Bash
#!/usr/bin/env bash
# loop-watchdog.sh — Self-healing monitor for all agent loops
# Runs every 2 minutes. Restarts dead loops, kills zombies,
# and files Gitea issues for problems it can't auto-fix.
#
# Usage: Run via cron or: nohup bash ~/.hermes/bin/loop-watchdog.sh &

# Note: -e is not enabled, so a failing command does not terminate the script.
set -uo pipefail

LOG_DIR="$HOME/.hermes/logs"
LOG="$LOG_DIR/watchdog.log"
ISSUE_LOG="$LOG_DIR/watchdog-issues.json"  # tracks filed issues to avoid duplicates
GITEA_URL="http://143.198.27.163:3000"
# Stays empty when the token file is missing or unreadable (stderr is discarded).
ADMIN_TOKEN=$(cat "$HOME/.config/gitea/token" 2>/dev/null)
ISSUE_REPO="rockachopa/hermes-agent"       # ops issues go here
CHECK_INTERVAL=120                         # 2 minutes

# Make sure the log directory and the dedup state file exist before any check runs.
mkdir -p "$LOG_DIR"
[ -f "$ISSUE_LOG" ] || echo '{}' > "$ISSUE_LOG"

# Append one timestamped line to the watchdog log file ($LOG).
# Globals:   LOG (read) - path of the log file
# Arguments: $* - message text; multiple arguments are joined with spaces
log() {
  # printf keeps backslashes and leading dashes in the message verbatim.
  printf '[%s] WATCHDOG: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG"
}

# File a Gitea issue for problems that can't be auto-fixed.
# Deduplicates: won't file the same issue_key within 6 hours.
#
# Globals:   ISSUE_LOG (read/written), GITEA_URL, ISSUE_REPO, ADMIN_TOKEN (read)
# Arguments: $1 - issue_key used for deduplication
#            $2 - issue title (prefixed with "[watchdog] " when filed)
#            $3 - issue body; literal "\n" sequences are turned into newlines
#            $4 - assignee (optional, defaults to "claude")
#
# All values reach Python through argv instead of being spliced into the
# Python source, so quotes or backslashes in log excerpts cannot break the
# script or execute as code.
# The dedup entry is written before the HTTP request is made, so a failed
# request is not retried until the 6-hour window has passed.
file_issue() {
  local issue_key="$1"
  local title="$2"
  local body="$3"
  local assignee="${4:-claude}"

  # Check if we already filed this recently (and record it if not).
  local should_file
  should_file=$(python3 -c '
import json, sys, time

path, key, title = sys.argv[1:4]
try:
    with open(path) as f:
        filed = json.load(f)
except Exception:
    filed = {}
if not isinstance(filed, dict):
    filed = {}

entry = filed.get(key)
if isinstance(entry, dict) and entry.get("until", 0) > time.time():
    print("no")
else:
    filed[key] = {"until": time.time() + 21600, "title": title}
    with open(path, "w") as f:
        json.dump(filed, f, indent=2)
    print("yes")
' "$ISSUE_LOG" "$issue_key" "$title" 2>/dev/null)

  if [ "$should_file" != "yes" ]; then
    return 0
  fi

  log "FILING ISSUE: $title"

  local stamp payload
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  payload=$(python3 -c '
import json, sys

title, body, assignee, stamp = sys.argv[1:5]
# Callers separate paragraphs with a literal backslash-n; expand it here.
body = body.replace("\\n", "\n")
print(json.dumps({
    "title": "[watchdog] " + title,
    "body": body + "\n\n---\n*Auto-filed by loop-watchdog at " + stamp + "*",
    "assignees": [assignee],
}))
' "$title" "$body" "$assignee" "$stamp" 2>/dev/null)

  if [ -z "$payload" ]; then
    log "WARN: Failed to file issue: $title"
    return 0
  fi

  # --max-time keeps an unresponsive Gitea from blocking the watchdog forever.
  if ! curl -sf --max-time 10 -X POST \
      "${GITEA_URL}/api/v1/repos/${ISSUE_REPO}/issues" \
      -H "Authorization: token ${ADMIN_TOKEN}" \
      -H "Content-Type: application/json" \
      -d "$payload" >/dev/null 2>&1; then
    log "WARN: Failed to file issue: $title"
  fi
}

# === HEALTH CHECKS ===

# Print the modification time of file $1 as epoch seconds (0 if unavailable).
_watchdog_mtime() {
  local target="$1"
  local mtime
  # GNU syntax goes first: BSD stat rejects -c without printing anything,
  # whereas GNU stat treats -f as "filesystem mode", prints unrelated
  # multi-line output and only then fails.
  mtime=$(stat -c %Y "$target" 2>/dev/null) \
    || mtime=$(stat -f %m "$target" 2>/dev/null) \
    || mtime=0
  case "$mtime" in
    ''|*[!0-9]*) mtime=0 ;;
  esac
  printf '%s\n' "$mtime"
}

# Check one agent loop: restart it when it is down, kill and restart it when
# its log has gone stale, and file an issue when it keeps failing.
#
# Arguments: $1 - name (kimi | claude | gemini), used in log and issue text
#            $2 - pgrep/pkill -f pattern that finds the loop process
#            $3 - command line that restarts the loop
#            $4 - log file inspected for staleness and errors
#            $5 - optional pkill -f pattern for the loop's worker processes
check_loop() {
  local name="$1"          # kimi | claude | gemini
  local grep_pat="$2"      # pattern to find the loop process
  local wake_cmd="$3"      # command to restart
  local log_file="$4"      # log to check for errors
  local worker_pat="${5:-}" # optional: pattern for worker processes

  local pid
  pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)

  if [ -z "$pid" ]; then
    log "$name loop DOWN — restarting..."
    # wake_cmd carries its own redirections and trailing '&', hence eval.
    eval "$wake_cmd"
    sleep 3

    # Verify it came back
    pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
    if [ -z "$pid" ]; then
      file_issue \
        "${name}-loop-dead" \
        "${name} loop won't start" \
        "The ${name} agent loop failed to start after automatic restart attempt.\n\nRestart command: \`${wake_cmd}\`\n\nLast 20 lines of log:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`\n\nPlease investigate and fix the startup issue." \
        "claude"
    else
      log "$name loop restarted (PID $pid)"
    fi
    return
  fi

  # Loop is running — check for stalls
  if [ -f "$log_file" ]; then
    local last_activity
    last_activity=$(_watchdog_mtime "$log_file")
    local now
    now=$(date +%s)
    local stale_seconds=$(( now - last_activity ))

    # If no log activity for 30 minutes, something is wrong
    if [ "$stale_seconds" -gt 1800 ]; then
      log "$name loop STALE — no activity for ${stale_seconds}s"

      # Check if it's just idle (empty queue) vs truly stuck
      local last_line
      last_line=$(tail -1 "$log_file" 2>/dev/null)
      if printf '%s\n' "$last_line" \
          | grep -Eq 'Queue empty|Waiting for assignments|idle'; then
        # Just idle, that's fine
        return
      fi

      # Kill and restart
      log "$name loop stuck — killing and restarting..."
      pkill -f "$grep_pat" 2>/dev/null
      if [ -n "$worker_pat" ]; then
        pkill -f "$worker_pat" 2>/dev/null
      fi
      sleep 2
      eval "$wake_cmd"
      sleep 3

      pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
      if [ -z "$pid" ]; then
        file_issue \
          "${name}-loop-stuck" \
          "${name} loop stuck and won't restart" \
          "The ${name} loop was stale (${stale_seconds}s no activity) and failed to restart.\n\nLast 20 lines:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`" \
          "claude"
      else
        log "$name loop recovered (PID $pid)"
      fi
    fi

    # Check for crash loops (5+ failures in last 50 lines)
    local recent_failures
    # grep -c exits non-zero on zero matches but still prints 0.
    recent_failures=$(tail -50 "$log_file" 2>/dev/null | grep -Ec 'FAILED:|ERROR:' || true)
    if [ "${recent_failures:-0}" -ge 5 ]; then
      local error_sample
      error_sample=$(tail -50 "$log_file" 2>/dev/null | grep -E 'FAILED:|ERROR:' | tail -5)
      file_issue \
        "${name}-crash-loop" \
        "${name} agent in crash loop (${recent_failures} recent failures)" \
        "The ${name} agent loop has ${recent_failures} failures in its last 50 log lines, suggesting a systemic problem.\n\nRecent errors:\n\`\`\`\n${error_sample}\n\`\`\`\n\nPlease investigate the root cause — could be API issues, bad repo state, or CLI problems." \
        "claude"
    fi
  fi
}

# Probe the Gitea API version endpoint; log and file an issue when it does
# not answer successfully within 5 seconds.
# Globals: GITEA_URL (read)
check_gitea() {
  if ! curl -sf --max-time 5 "${GITEA_URL}/api/v1/version" >/dev/null 2>&1; then
    log "Gitea UNREACHABLE"
    file_issue \
      "gitea-down" \
      "Gitea instance unreachable" \
      "The Gitea instance at ${GITEA_URL} is not responding. All agent loops depend on it.\n\nThis needs immediate attention — check the VPS at 143.198.27.163." \
      "claude"
  fi
}

# Kill piled-up git push/transport processes and pytest runs.
# Each group is only killed once more than 3 matching processes exist.
check_zombies() {
  # pgrep and pkill interpret patterns as extended regexes, where alternation
  # is a bare '|' ('\|' would match a literal pipe character and never fire).
  # Using the same pattern for counting and killing keeps both in agreement.
  local git_pat='git.*push|git-remote-http'
  local pytest_pat='pytest tests/'

  local stuck_git
  stuck_git=$(pgrep -f "$git_pat" 2>/dev/null | wc -l | tr -d ' ')
  local orphan_py
  orphan_py=$(pgrep -f "$pytest_pat" 2>/dev/null | wc -l | tr -d ' ')

  if [ "${stuck_git:-0}" -gt 3 ]; then
    log "Killing $stuck_git stuck git processes"
    pkill -f "$git_pat" 2>/dev/null || true
  fi

  if [ "${orphan_py:-0}" -gt 3 ]; then
    log "Killing $orphan_py orphaned pytest processes"
    pkill -f "$pytest_pat" 2>/dev/null || true
  fi
}

# Warn and file an issue when more than 30 directories have accumulated
# directly under ~/worktrees. Despite its name, this is the only thing the
# function inspects.
check_disk() {
  local worktree_count
  # -mindepth 1 keeps ~/worktrees itself out of the count, so the number that
  # is logged and reported is the real number of worktree directories.
  worktree_count=$(find "$HOME/worktrees" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')

  if [ "${worktree_count:-0}" -gt 30 ]; then
    log "WARN: $worktree_count worktrees — possible leak"
    file_issue \
      "worktree-leak" \
      "Worktree accumulation: ${worktree_count} dirs in ~/worktrees" \
      "There are ${worktree_count} worktree directories, suggesting cleanup is failing.\n\nList:\n\`\`\`\n$(ls -1 "$HOME/worktrees/" 2>/dev/null | head -30)\n\`\`\`\n\nPlease investigate which loops are leaking worktrees and fix the cleanup." \
      "claude"
  fi
}

# File an issue for every agent whose skip list holds more than 10 entries
# that have not expired yet (entry "until" timestamp lies in the future).
# Agents without a skip-list file are ignored.
# Globals: LOG_DIR (read)
check_skip_lists() {
  local agent skip_file skip_count
  for agent in claude gemini kimi; do
    skip_file="$LOG_DIR/${agent}-skip-list.json"
    [ -f "$skip_file" ] || continue

    # The path is handed over as an argument rather than pasted into the
    # Python source, so quotes or backslashes in it cannot break the script.
    skip_count=$(python3 -c '
import json, sys, time
try:
    with open(sys.argv[1]) as f:
        skips = json.load(f)
    active = sum(1 for v in skips.values() if v.get("until", 0) > time.time())
    print(active)
except Exception:
    print(0)
' "$skip_file" 2>/dev/null)

    if [ "${skip_count:-0}" -gt 10 ]; then
      file_issue \
        "${agent}-skip-overload" \
        "${agent} has ${skip_count} skipped issues — systemic failure" \
        "The ${agent} agent has ${skip_count} issues in its skip list, meaning most of its queue is blocked.\n\nSkip list contents:\n\`\`\`\n$(python3 -m json.tool < "$skip_file" 2>/dev/null | head -40)\n\`\`\`\n\nThis likely indicates a systemic problem (broken tests, bad config, API changes)." \
        "claude"
    fi
  done
}

# === MAIN ===
log "=== Watchdog Started ==="

# Runs forever: one full pass over every check, then a pause of
# CHECK_INTERVAL seconds before the next pass.
while true; do
  # Gitea is probed first; the remaining checks run whatever the outcome.
  check_gitea

  # Check each agent loop
  # (name, process pattern, restart command, log file, worker pattern)
  check_loop "kimi" "kimi-loop.sh" \
    "nohup bash ~/.hermes/bin/kimi-loop.sh >> ~/.hermes/logs/kimi-loop.log 2>&1 &" \
    "$LOG_DIR/kimi-loop.log" \
    "kimi.*--print"

  check_loop "claude" "claude-loop.sh" \
    "nohup bash ~/.hermes/bin/claude-loop.sh 3 >> ~/.hermes/logs/claude-loop.log 2>&1 &" \
    "$LOG_DIR/claude-loop.log" \
    "claude.*--print.*--dangerously"

  check_loop "gemini" "gemini-loop.sh" \
    "nohup bash ~/.hermes/bin/gemini-loop.sh >> ~/.hermes/logs/gemini-loop.log 2>&1 &" \
    "$LOG_DIR/gemini-loop.log" \
    "gemini.*-p"

  # Housekeeping
  check_zombies
  check_disk
  check_skip_lists

  sleep "$CHECK_INTERVAL"
done