feat: parallel workers for all agents, dynamic scaling, self-healing watchdog

- claude-loop: 7 workers default, scales up to 21, 5s cooldown
- gemini-loop: rewritten as a parallel worker system (3→12), multi-repo,
  auto-clone, correct CLI flags (-p/--yolo), bash 3.2 compatible
- loop-watchdog: monitors all loops every 2 min, auto-restarts dead loops,
  kills zombies, files Gitea issues for unfixable problems (sketched after
  this list)
- ops-helpers: added ops-wake-watchdog, ops-kill-watchdog
- All scripts use file-based PID tracking (bash 3.2 safe)
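
A minimal sketch of the watchdog's check cycle described above (loop names
and PID-file paths are illustrative assumptions, not the committed code; the
issue-filing and zombie-kill steps are omitted):

    # Sketch only: not the committed loop-watchdog implementation
    while true; do
        for loop in claude-loop gemini-loop; do
            pf="$HOME/.hermes/logs/${loop}.pid"      # assumed PID location
            if [ -f "$pf" ] && ! kill -0 "$(cat "$pf")" 2>/dev/null; then
                echo "watchdog: $loop is dead, restarting"
                nohup "$loop" >/dev/null 2>&1 &      # assumed to be on PATH
                echo $! > "$pf"
            fi
        done
        sleep 120    # the 2-minute cycle described above
    done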

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Alexander Whitestone
Date:   2026-03-22 19:22:18 -04:00
parent 0dba33c811
commit 5f8129d346
5 changed files with 660 additions and 220 deletions
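
The gemini-loop rewrite lives in another file of this commit. A sketch of the
invocation pattern the message describes; only the -p and --yolo flags come
from the commit message, while the repo layout and variables are assumptions:

    # Auto-clone the target repo if missing, then run gemini non-interactively
    repo="$WORKTREE_BASE/$REPO_NAME"                 # hypothetical layout
    [ -d "$repo/.git" ] || git clone "$GITEA_URL/$OWNER/$REPO_NAME.git" "$repo"
    cd "$repo"
    gemini --yolo -p "Fix issue #${ISSUE_NUM}: ${ISSUE_TITLE}"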


@@ -8,12 +8,13 @@
 set -euo pipefail
 # === CONFIG ===
-NUM_WORKERS="${1:-3}"
+NUM_WORKERS="${1:-7}"
+MAX_WORKERS=21           # absolute ceiling
 WORKTREE_BASE="$HOME/worktrees"
 GITEA_URL="http://143.198.27.163:3000"
 GITEA_TOKEN=$(cat "$HOME/.hermes/claude_token")
 CLAUDE_TIMEOUT=900       # 15 min per issue
-COOLDOWN=15              # seconds between launching workers
+COOLDOWN=5               # seconds between issues (fast cycle)
 RATE_LIMIT_SLEEP=60      # initial sleep on rate limit
 MAX_RATE_SLEEP=300       # max backoff on rate limit
 LOG_DIR="$HOME/.hermes/logs"
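
Not shown in this hunk: the worker's rate-limit handler that consumes
RATE_LIMIT_SLEEP and MAX_RATE_SLEEP. A minimal sketch of the usual doubling
backoff, assuming a hypothetical run_claude stand-in for the worker's claude
call; the "RATE LIMITED" log string matches the one the scaler greps for below:

    backoff="$RATE_LIMIT_SLEEP"
    until run_claude; do                    # hypothetical worker command
        log "RATE LIMITED, sleeping ${backoff}s"
        sleep "$backoff"
        backoff=$(( backoff * 2 ))          # double on each consecutive hit
        [ "$backoff" -gt "$MAX_RATE_SLEEP" ] && backoff="$MAX_RATE_SLEEP"
    done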
@@ -402,18 +403,73 @@ else: print('')
 }
 # === MAIN ===
-log "=== Claude Loop Started — ${NUM_WORKERS} workers ==="
+log "=== Claude Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ==="
 log "Worktrees: ${WORKTREE_BASE}"
 # Clean stale locks
 rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
-# Launch workers
+# PID tracking via files (bash 3.2 compatible)
+PID_DIR="$LOG_DIR/claude-pids"
+mkdir -p "$PID_DIR"
+rm -f "$PID_DIR"/*.pid 2>/dev/null
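+# One .pid file per worker ID; bash 3.2 (the macOS default) has no associative arrays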
+launch_worker() {
+    local wid="$1"
+    run_worker "$wid" &
+    echo $! > "$PID_DIR/${wid}.pid"
+    log "Launched worker $wid (PID $!)"
+}
+# Initial launch
 for i in $(seq 1 "$NUM_WORKERS"); do
-    run_worker "$i" &
-    log "Launched worker $i (PID $!)"
-    sleep 5    # stagger starts
+    launch_worker "$i"
+    sleep 3
 done
-# Wait for all workers
-wait
+# === DYNAMIC SCALER ===
+# Every 3 minutes: check health, scale up if no rate limits, scale down if hitting limits
+CURRENT_WORKERS="$NUM_WORKERS"
+while true; do
+    sleep 180
+    # Reap dead workers and relaunch
+    for pidfile in "$PID_DIR"/*.pid; do
+        [ -f "$pidfile" ] || continue
+        wid=$(basename "$pidfile" .pid)
+        wpid=$(cat "$pidfile")
+        if ! kill -0 "$wpid" 2>/dev/null; then
+            log "SCALER: Worker $wid died — relaunching"
+            launch_worker "$wid"
+            sleep 2
+        fi
+    done
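+    # Gauge recent pressure: rate-limit hits vs. successes in the last 100 log lines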
+    recent_rate_limits=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true)
+    recent_successes=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "SUCCESS" || true)
+    if [ "$recent_rate_limits" -gt 0 ]; then
+        if [ "$CURRENT_WORKERS" -gt 2 ]; then
+            drop_to=$(( CURRENT_WORKERS / 2 ))
+            [ "$drop_to" -lt 2 ] && drop_to=2
+            log "SCALER: Rate limited — scaling ${CURRENT_WORKERS}→${drop_to} workers"
+            for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do
+                if [ -f "$PID_DIR/${wid}.pid" ]; then
+                    kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true
+                    rm -f "$PID_DIR/${wid}.pid"
+                    update_active "$wid" "" "" "done"
+                fi
+            done
+            CURRENT_WORKERS=$drop_to
+        fi
+    elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then
+        new_count=$(( CURRENT_WORKERS + 2 ))
+        [ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS
+        log "SCALER: Healthy — scaling ${CURRENT_WORKERS}→${new_count} workers"
+        for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do
+            launch_worker "$wid"
+            sleep 2
+        done
+        CURRENT_WORKERS=$new_count
+    fi
+done
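
The ops-helpers additions (ops-wake-watchdog, ops-kill-watchdog) are in
another changed file of this commit. Under the same file-based PID
convention, they would look roughly like this; the watchdog PID path is an
assumption:

    WATCHDOG_PID="$HOME/.hermes/logs/loop-watchdog.pid"    # assumed location

    ops-wake-watchdog() {
        if [ -f "$WATCHDOG_PID" ] && kill -0 "$(cat "$WATCHDOG_PID")" 2>/dev/null; then
            echo "watchdog already running (PID $(cat "$WATCHDOG_PID"))"
        else
            nohup loop-watchdog >/dev/null 2>&1 &
            echo $! > "$WATCHDOG_PID"
        fi
    }

    ops-kill-watchdog() {
        [ -f "$WATCHDOG_PID" ] || { echo "no watchdog PID file"; return 1; }
        kill "$(cat "$WATCHDOG_PID")" 2>/dev/null || true
        rm -f "$WATCHDOG_PID"
    }

A PID file per process keeps each helper a few lines of plain shell, which is
what makes the whole setup bash 3.2 safe.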