feat: parallel workers for all agents, dynamic scaling, self-healing watchdog

- claude-loop: 7 workers default, scales up to 21, 5s cooldown
- gemini-loop: rewritten as a parallel worker system (3→12), multi-repo,
  auto-clone, correct CLI flags (-p/--yolo), bash 3.2 compatible
- loop-watchdog: monitors all loops every 2 min, auto-restarts dead loops,
  kills zombies, files Gitea issues for unfixable problems (sketched after
  this list)
- ops-helpers: added ops-wake-watchdog, ops-kill-watchdog
- All scripts use file-based PID tracking (bash 3.2 safe)
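
A minimal sketch of the watchdog's check cycle described above (loop names
and PID-file paths are illustrative assumptions, not the committed code; the
issue-filing and zombie-kill steps are omitted):

    # Sketch only: not the committed loop-watchdog implementation
    while true; do
        for loop in claude-loop gemini-loop; do
            pf="$HOME/.hermes/logs/${loop}.pid"      # assumed PID location
            if [ -f "$pf" ] && ! kill -0 "$(cat "$pf")" 2>/dev/null; then
                echo "watchdog: $loop is dead, restarting"
                nohup "$loop" >/dev/null 2>&1 &      # assumed to be on PATH
                echo $! > "$pf"
            fi
        done
        sleep 120    # the 2-minute cycle described above
    done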

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Alexander Whitestone
Date:   2026-03-22 19:22:18 -04:00
parent 0dba33c811
commit 5f8129d346
5 changed files with 660 additions and 220 deletions
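
The gemini-loop rewrite lives in another file of this commit. A sketch of the
invocation pattern the message describes; only the -p and --yolo flags come
from the commit message, while the repo layout and variables are assumptions:

    # Auto-clone the target repo if missing, then run gemini non-interactively
    repo="$WORKTREE_BASE/$REPO_NAME"                 # hypothetical layout
    [ -d "$repo/.git" ] || git clone "$GITEA_URL/$OWNER/$REPO_NAME.git" "$repo"
    cd "$repo"
    gemini --yolo -p "Fix issue #${ISSUE_NUM}: ${ISSUE_TITLE}"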


@@ -8,12 +8,13 @@
 set -euo pipefail
 # === CONFIG ===
-NUM_WORKERS="${1:-3}"
+NUM_WORKERS="${1:-7}"
+MAX_WORKERS=21           # absolute ceiling
 WORKTREE_BASE="$HOME/worktrees"
 GITEA_URL="http://143.198.27.163:3000"
 GITEA_TOKEN=$(cat "$HOME/.hermes/claude_token")
 CLAUDE_TIMEOUT=900       # 15 min per issue
-COOLDOWN=15              # seconds between launching workers
+COOLDOWN=5               # seconds between issues (fast cycle)
 RATE_LIMIT_SLEEP=60      # initial sleep on rate limit
 MAX_RATE_SLEEP=300       # max backoff on rate limit
 LOG_DIR="$HOME/.hermes/logs"
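
Not shown in this hunk: the worker's rate-limit handler that consumes
RATE_LIMIT_SLEEP and MAX_RATE_SLEEP. A minimal sketch of the usual doubling
backoff, assuming a hypothetical run_claude stand-in for the worker's claude
call; the "RATE LIMITED" log string matches the one the scaler greps for below:

    backoff="$RATE_LIMIT_SLEEP"
    until run_claude; do                    # hypothetical worker command
        log "RATE LIMITED, sleeping ${backoff}s"
        sleep "$backoff"
        backoff=$(( backoff * 2 ))          # double on each consecutive hit
        [ "$backoff" -gt "$MAX_RATE_SLEEP" ] && backoff="$MAX_RATE_SLEEP"
    done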
@@ -402,18 +403,73 @@ else: print('')
 }
 # === MAIN ===
-log "=== Claude Loop Started — ${NUM_WORKERS} workers ==="
+log "=== Claude Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ==="
 log "Worktrees: ${WORKTREE_BASE}"
 # Clean stale locks
 rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
-# Launch workers
+# PID tracking via files (bash 3.2 compatible)
+PID_DIR="$LOG_DIR/claude-pids"
+mkdir -p "$PID_DIR"
+rm -f "$PID_DIR"/*.pid 2>/dev/null
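+# One .pid file per worker ID; bash 3.2 (the macOS default) has no associative arrays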
+launch_worker() {
+    local wid="$1"
+    run_worker "$wid" &
+    echo $! > "$PID_DIR/${wid}.pid"
+    log "Launched worker $wid (PID $!)"
+}
+# Initial launch
 for i in $(seq 1 "$NUM_WORKERS"); do
-    run_worker "$i" &
-    log "Launched worker $i (PID $!)"
-    sleep 5    # stagger starts
+    launch_worker "$i"
+    sleep 3
 done
-# Wait for all workers
-wait
+# === DYNAMIC SCALER ===
+# Every 3 minutes: check health, scale up if no rate limits, scale down if hitting limits
+CURRENT_WORKERS="$NUM_WORKERS"
+while true; do
+    sleep 180
+    # Reap dead workers and relaunch
+    for pidfile in "$PID_DIR"/*.pid; do
+        [ -f "$pidfile" ] || continue
+        wid=$(basename "$pidfile" .pid)
+        wpid=$(cat "$pidfile")
+        if ! kill -0 "$wpid" 2>/dev/null; then
+            log "SCALER: Worker $wid died — relaunching"
+            launch_worker "$wid"
+            sleep 2
+        fi
+    done
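+    # Gauge recent pressure: rate-limit hits vs. successes in the last 100 log lines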
+    recent_rate_limits=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true)
+    recent_successes=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "SUCCESS" || true)
+    if [ "$recent_rate_limits" -gt 0 ]; then
+        if [ "$CURRENT_WORKERS" -gt 2 ]; then
+            drop_to=$(( CURRENT_WORKERS / 2 ))
+            [ "$drop_to" -lt 2 ] && drop_to=2
+            log "SCALER: Rate limited — scaling ${CURRENT_WORKERS}→${drop_to} workers"
+            for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do
+                if [ -f "$PID_DIR/${wid}.pid" ]; then
+                    kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true
+                    rm -f "$PID_DIR/${wid}.pid"
+                    update_active "$wid" "" "" "done"
+                fi
+            done
+            CURRENT_WORKERS=$drop_to
+        fi
+    elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then
+        new_count=$(( CURRENT_WORKERS + 2 ))
+        [ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS
+        log "SCALER: Healthy — scaling ${CURRENT_WORKERS}→${new_count} workers"
+        for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do
+            launch_worker "$wid"
+            sleep 2
+        done
+        CURRENT_WORKERS=$new_count
+    fi
+done
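
The ops-helpers additions (ops-wake-watchdog, ops-kill-watchdog) are in
another changed file of this commit. Under the same file-based PID
convention, they would look roughly like this; the watchdog PID path is an
assumption:

    WATCHDOG_PID="$HOME/.hermes/logs/loop-watchdog.pid"    # assumed location

    ops-wake-watchdog() {
        if [ -f "$WATCHDOG_PID" ] && kill -0 "$(cat "$WATCHDOG_PID")" 2>/dev/null; then
            echo "watchdog already running (PID $(cat "$WATCHDOG_PID"))"
        else
            nohup loop-watchdog >/dev/null 2>&1 &
            echo $! > "$WATCHDOG_PID"
        fi
    }

    ops-kill-watchdog() {
        [ -f "$WATCHDOG_PID" ] || { echo "no watchdog PID file"; return 1; }
        kill "$(cat "$WATCHDOG_PID")" 2>/dev/null || true
        rm -f "$WATCHDOG_PID"
    }

A PID file per process keeps each helper a few lines of plain shell, which is
what makes the whole setup bash 3.2 safe.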