feat: parallel workers for all agents, dynamic scaling, self-healing watchdog
- claude-loop: 7 workers default, scales up to 21, 5s cooldown - gemini-loop: rewritten as parallel worker system (3→12), multi-repo, auto-clone, correct CLI flags (-p/--yolo), bash 3.2 compatible - loop-watchdog: monitors all loops every 2min, auto-restarts dead loops, kills zombies, files Gitea issues for unfixable problems - ops-helpers: added ops-wake-watchdog, ops-kill-watchdog - All scripts use file-based PID tracking (bash 3.2 safe) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -73,6 +73,7 @@ bin/*
|
|||||||
!bin/timmy-loopstat.sh
|
!bin/timmy-loopstat.sh
|
||||||
!bin/start-dashboard.sh
|
!bin/start-dashboard.sh
|
||||||
!bin/gemini-loop.sh
|
!bin/gemini-loop.sh
|
||||||
|
!bin/loop-watchdog.sh
|
||||||
|
|
||||||
# ── Queue (transient task queue) ─────────────────────────────────────
|
# ── Queue (transient task queue) ─────────────────────────────────────
|
||||||
queue/
|
queue/
|
||||||
|
|||||||
@@ -8,12 +8,13 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# === CONFIG ===
|
# === CONFIG ===
|
||||||
NUM_WORKERS="${1:-3}"
|
NUM_WORKERS="${1:-7}"
|
||||||
|
MAX_WORKERS=21 # absolute ceiling
|
||||||
WORKTREE_BASE="$HOME/worktrees"
|
WORKTREE_BASE="$HOME/worktrees"
|
||||||
GITEA_URL="http://143.198.27.163:3000"
|
GITEA_URL="http://143.198.27.163:3000"
|
||||||
GITEA_TOKEN=$(cat "$HOME/.hermes/claude_token")
|
GITEA_TOKEN=$(cat "$HOME/.hermes/claude_token")
|
||||||
CLAUDE_TIMEOUT=900 # 15 min per issue
|
CLAUDE_TIMEOUT=900 # 15 min per issue
|
||||||
COOLDOWN=15 # seconds between launching workers
|
COOLDOWN=5 # seconds between issues (fast cycle)
|
||||||
RATE_LIMIT_SLEEP=60 # initial sleep on rate limit
|
RATE_LIMIT_SLEEP=60 # initial sleep on rate limit
|
||||||
MAX_RATE_SLEEP=300 # max backoff on rate limit
|
MAX_RATE_SLEEP=300 # max backoff on rate limit
|
||||||
LOG_DIR="$HOME/.hermes/logs"
|
LOG_DIR="$HOME/.hermes/logs"
|
||||||
@@ -402,18 +403,73 @@ else: print('')
|
|||||||
}
|
}
|
||||||
|
|
||||||
# === MAIN ===
|
# === MAIN ===
|
||||||
log "=== Claude Loop Started — ${NUM_WORKERS} workers ==="
|
log "=== Claude Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ==="
|
||||||
log "Worktrees: ${WORKTREE_BASE}"
|
log "Worktrees: ${WORKTREE_BASE}"
|
||||||
|
|
||||||
# Clean stale locks
|
# Clean stale locks
|
||||||
rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
|
rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
|
||||||
|
|
||||||
# Launch workers
|
# PID tracking via files (bash 3.2 compatible)
|
||||||
|
PID_DIR="$LOG_DIR/claude-pids"
|
||||||
|
mkdir -p "$PID_DIR"
|
||||||
|
rm -f "$PID_DIR"/*.pid 2>/dev/null
|
||||||
|
|
||||||
|
launch_worker() {
|
||||||
|
local wid="$1"
|
||||||
|
run_worker "$wid" &
|
||||||
|
echo $! > "$PID_DIR/${wid}.pid"
|
||||||
|
log "Launched worker $wid (PID $!)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Initial launch
|
||||||
for i in $(seq 1 "$NUM_WORKERS"); do
|
for i in $(seq 1 "$NUM_WORKERS"); do
|
||||||
run_worker "$i" &
|
launch_worker "$i"
|
||||||
log "Launched worker $i (PID $!)"
|
sleep 3
|
||||||
sleep 5 # stagger starts
|
|
||||||
done
|
done
|
||||||
|
|
||||||
# Wait for all workers
|
# === DYNAMIC SCALER ===
|
||||||
wait
|
# Every 3 minutes: check health, scale up if no rate limits, scale down if hitting limits
|
||||||
|
CURRENT_WORKERS="$NUM_WORKERS"
|
||||||
|
while true; do
|
||||||
|
sleep 180
|
||||||
|
|
||||||
|
# Reap dead workers and relaunch
|
||||||
|
for pidfile in "$PID_DIR"/*.pid; do
|
||||||
|
[ -f "$pidfile" ] || continue
|
||||||
|
wid=$(basename "$pidfile" .pid)
|
||||||
|
wpid=$(cat "$pidfile")
|
||||||
|
if ! kill -0 "$wpid" 2>/dev/null; then
|
||||||
|
log "SCALER: Worker $wid died — relaunching"
|
||||||
|
launch_worker "$wid"
|
||||||
|
sleep 2
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
recent_rate_limits=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true)
|
||||||
|
recent_successes=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "SUCCESS" || true)
|
||||||
|
|
||||||
|
if [ "$recent_rate_limits" -gt 0 ]; then
|
||||||
|
if [ "$CURRENT_WORKERS" -gt 2 ]; then
|
||||||
|
drop_to=$(( CURRENT_WORKERS / 2 ))
|
||||||
|
[ "$drop_to" -lt 2 ] && drop_to=2
|
||||||
|
log "SCALER: Rate limited — scaling ${CURRENT_WORKERS} → ${drop_to} workers"
|
||||||
|
for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do
|
||||||
|
if [ -f "$PID_DIR/${wid}.pid" ]; then
|
||||||
|
kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true
|
||||||
|
rm -f "$PID_DIR/${wid}.pid"
|
||||||
|
update_active "$wid" "" "" "done"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
CURRENT_WORKERS=$drop_to
|
||||||
|
fi
|
||||||
|
elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then
|
||||||
|
new_count=$(( CURRENT_WORKERS + 2 ))
|
||||||
|
[ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS
|
||||||
|
log "SCALER: Healthy — scaling ${CURRENT_WORKERS} → ${new_count} workers"
|
||||||
|
for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do
|
||||||
|
launch_worker "$wid"
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
CURRENT_WORKERS=$new_count
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|||||||
@@ -1,321 +1,437 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# gemini-loop.sh — Dropout-proof Gemini code agent dispatch loop
|
# gemini-loop.sh — Parallel Gemini Code agent dispatch loop
|
||||||
# Picks an open issue from Gitea, creates a worktree, runs Gemini Code CLI,
|
# Runs N workers concurrently against the Gitea backlog.
|
||||||
# handles failures gracefully, and loops forever.
|
# Dynamic scaling: starts at N, scales up to MAX, drops on rate limits.
|
||||||
#
|
#
|
||||||
# Dropout-proof means:
|
# Usage: gemini-loop.sh [NUM_WORKERS] (default: 3)
|
||||||
# - If Gemini Code crashes/hangs, we kill it and move on
|
|
||||||
# - If worktree creation fails, skip and retry
|
|
||||||
# - If push fails, log and continue
|
|
||||||
# - Exponential backoff on repeated failures
|
|
||||||
# - Clean up worktrees after PR is created
|
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# === CONFIG ===
|
# === CONFIG ===
|
||||||
REPO_DIR="$HOME/worktrees/gemini-repo"
|
NUM_WORKERS="${1:-3}"
|
||||||
|
MAX_WORKERS=12
|
||||||
WORKTREE_BASE="$HOME/worktrees"
|
WORKTREE_BASE="$HOME/worktrees"
|
||||||
GITEA_URL="http://143.198.27.163:3000"
|
GITEA_URL="http://143.198.27.163:3000"
|
||||||
GITEA_TOKEN=$(cat "$HOME/.hermes/gemini_token")
|
GITEA_TOKEN=$(cat "$HOME/.hermes/gemini_token")
|
||||||
REPO_OWNER="rockachopa"
|
|
||||||
REPO_NAME="Timmy-time-dashboard"
|
|
||||||
GEMINI_TIMEOUT=600 # 10 min per issue
|
GEMINI_TIMEOUT=600 # 10 min per issue
|
||||||
COOLDOWN=30 # seconds between issues
|
COOLDOWN=5 # seconds between issues
|
||||||
MAX_FAILURES=5 # consecutive failures before long sleep
|
RATE_LIMIT_SLEEP=60
|
||||||
LONG_SLEEP=300 # 5 min backoff on repeated failures
|
MAX_RATE_SLEEP=300
|
||||||
LOG_DIR="$HOME/.hermes/logs"
|
LOG_DIR="$HOME/.hermes/logs"
|
||||||
SKIP_FILE="$LOG_DIR/gemini-skip-list.json" # issues to skip temporarily
|
SKIP_FILE="$LOG_DIR/gemini-skip-list.json"
|
||||||
|
LOCK_DIR="$LOG_DIR/gemini-locks"
|
||||||
|
ACTIVE_FILE="$LOG_DIR/gemini-active.json"
|
||||||
|
|
||||||
mkdir -p "$LOG_DIR" "$WORKTREE_BASE"
|
mkdir -p "$LOG_DIR" "$WORKTREE_BASE" "$LOCK_DIR"
|
||||||
|
|
||||||
# Initialize skip file if missing
|
|
||||||
[ -f "$SKIP_FILE" ] || echo '{}' > "$SKIP_FILE"
|
[ -f "$SKIP_FILE" ] || echo '{}' > "$SKIP_FILE"
|
||||||
|
echo '{}' > "$ACTIVE_FILE"
|
||||||
|
|
||||||
# === STATE ===
|
# === SHARED FUNCTIONS ===
|
||||||
failure_count=0
|
log() {
|
||||||
issues_completed=0
|
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_DIR/gemini-loop.log"
|
||||||
|
}
|
||||||
|
|
||||||
# === SKIP LIST FUNCTIONS ===
|
lock_issue() {
|
||||||
is_skipped() {
|
local issue_key="$1"
|
||||||
local issue_num="$1"
|
local lockfile="$LOCK_DIR/$issue_key.lock"
|
||||||
python3 -c "
|
if mkdir "$lockfile" 2>/dev/null; then
|
||||||
import json, time, sys
|
echo $$ > "$lockfile/pid"
|
||||||
try:
|
return 0
|
||||||
with open('$SKIP_FILE') as f: skips = json.load(f)
|
fi
|
||||||
except: skips = {}
|
return 1
|
||||||
entry = skips.get(str($issue_num), {})
|
}
|
||||||
if entry and entry.get('until', 0) > time.time():
|
|
||||||
print('skip')
|
unlock_issue() {
|
||||||
sys.exit(0)
|
rm -rf "$LOCK_DIR/$1.lock" 2>/dev/null
|
||||||
# Expired or not found — clean up and allow
|
|
||||||
if str($issue_num) in skips:
|
|
||||||
del skips[str($issue_num)]
|
|
||||||
with open('$SKIP_FILE', 'w') as f: json.dump(skips, f)
|
|
||||||
print('ok')
|
|
||||||
" 2>/dev/null
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mark_skip() {
|
mark_skip() {
|
||||||
local issue_num="$1"
|
local issue_num="$1" reason="$2" skip_hours="${3:-1}"
|
||||||
local reason="$2"
|
|
||||||
local skip_hours="${3:-1}" # default 1 hour
|
|
||||||
python3 -c "
|
python3 -c "
|
||||||
import json, time
|
import json, time, fcntl
|
||||||
try:
|
with open('$SKIP_FILE', 'r+') as f:
|
||||||
with open('$SKIP_FILE') as f: skips = json.load(f)
|
fcntl.flock(f, fcntl.LOCK_EX)
|
||||||
except: skips = {}
|
try: skips = json.load(f)
|
||||||
skips[str($issue_num)] = {
|
except: skips = {}
|
||||||
'until': time.time() + ($skip_hours * 3600),
|
skips[str($issue_num)] = {
|
||||||
'reason': '$reason',
|
'until': time.time() + ($skip_hours * 3600),
|
||||||
'failures': skips.get(str($issue_num), {}).get('failures', 0) + 1
|
'reason': '$reason',
|
||||||
}
|
'failures': skips.get(str($issue_num), {}).get('failures', 0) + 1
|
||||||
# If 3+ failures, skip for 6 hours instead
|
}
|
||||||
if skips[str($issue_num)]['failures'] >= 3:
|
if skips[str($issue_num)]['failures'] >= 3:
|
||||||
skips[str($issue_num)]['until'] = time.time() + (6 * 3600)
|
skips[str($issue_num)]['until'] = time.time() + (6 * 3600)
|
||||||
with open('$SKIP_FILE', 'w') as f: json.dump(skips, f, indent=2)
|
f.seek(0)
|
||||||
|
f.truncate()
|
||||||
|
json.dump(skips, f, indent=2)
|
||||||
" 2>/dev/null
|
" 2>/dev/null
|
||||||
log "SKIP: #${issue_num} added to skip list — ${reason}"
|
log "SKIP: #${issue_num} — ${reason}"
|
||||||
}
|
}
|
||||||
|
|
||||||
log() {
|
update_active() {
|
||||||
local msg="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
|
local worker="$1" issue="$2" repo="$3" status="$4"
|
||||||
echo "$msg" >> "$LOG_DIR/gemini-loop.log"
|
python3 -c "
|
||||||
|
import json, fcntl
|
||||||
|
with open('$ACTIVE_FILE', 'r+') as f:
|
||||||
|
fcntl.flock(f, fcntl.LOCK_EX)
|
||||||
|
try: active = json.load(f)
|
||||||
|
except: active = {}
|
||||||
|
if '$status' == 'done':
|
||||||
|
active.pop('$worker', None)
|
||||||
|
else:
|
||||||
|
active['$worker'] = {'issue': '$issue', 'repo': '$repo', 'status': '$status'}
|
||||||
|
f.seek(0)
|
||||||
|
f.truncate()
|
||||||
|
json.dump(active, f, indent=2)
|
||||||
|
" 2>/dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup_worktree() {
|
cleanup_worktree() {
|
||||||
local wt="$1"
|
local wt="$1" branch="$2"
|
||||||
local branch="$2"
|
|
||||||
if [ -d "$wt" ]; then
|
if [ -d "$wt" ]; then
|
||||||
cd "$REPO_DIR"
|
local parent
|
||||||
|
parent=$(git -C "$wt" rev-parse --git-common-dir 2>/dev/null | sed 's|/.git$||' || true)
|
||||||
|
[ -n "$parent" ] && [ -d "$parent" ] && cd "$parent"
|
||||||
git worktree remove --force "$wt" 2>/dev/null || rm -rf "$wt"
|
git worktree remove --force "$wt" 2>/dev/null || rm -rf "$wt"
|
||||||
git worktree prune 2>/dev/null
|
git worktree prune 2>/dev/null
|
||||||
git branch -D "$branch" 2>/dev/null || true
|
git branch -D "$branch" 2>/dev/null || true
|
||||||
log "Cleaned up worktree: $wt"
|
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
get_next_issue() {
|
get_next_issue() {
|
||||||
# Get open issues ASSIGNED TO GEMINI only — Gemini works its own queue
|
python3 -c "
|
||||||
# NOTE: Gitea's assignee filter is unreliable — we validate in Python
|
import json, sys, time, urllib.request, os
|
||||||
local skip_file="$SKIP_FILE"
|
|
||||||
curl -sf "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues?state=open&type=issues&limit=50&sort=created" \
|
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
|
||||||
import sys, json, time
|
|
||||||
|
|
||||||
issues = json.load(sys.stdin)
|
token = '${GITEA_TOKEN}'
|
||||||
# Reverse to oldest-first (Gitea returns newest-first) — respects dependency order
|
base = '${GITEA_URL}'
|
||||||
issues.reverse()
|
repos = [
|
||||||
|
'rockachopa/Timmy-time-dashboard',
|
||||||
|
'rockachopa/alexanderwhitestone.com',
|
||||||
|
'rockachopa/hermes-agent',
|
||||||
|
'replit/timmy-tower',
|
||||||
|
'replit/token-gated-economy',
|
||||||
|
]
|
||||||
|
|
||||||
# Load skip list
|
|
||||||
try:
|
try:
|
||||||
with open('${skip_file}') as f: skips = json.load(f)
|
with open('${SKIP_FILE}') as f: skips = json.load(f)
|
||||||
except: skips = {}
|
except: skips = {}
|
||||||
|
|
||||||
for i in issues:
|
try:
|
||||||
# MUST be assigned to gemini (Gitea filter is broken, validate here)
|
with open('${ACTIVE_FILE}') as f:
|
||||||
|
active = json.load(f)
|
||||||
|
active_issues = {v['issue'] for v in active.values()}
|
||||||
|
except:
|
||||||
|
active_issues = set()
|
||||||
|
|
||||||
|
all_issues = []
|
||||||
|
for repo in repos:
|
||||||
|
url = f'{base}/api/v1/repos/{repo}/issues?state=open&type=issues&limit=50&sort=created'
|
||||||
|
req = urllib.request.Request(url, headers={'Authorization': f'token {token}'})
|
||||||
|
try:
|
||||||
|
resp = urllib.request.urlopen(req, timeout=10)
|
||||||
|
issues = json.loads(resp.read())
|
||||||
|
for i in issues:
|
||||||
|
i['_repo'] = repo
|
||||||
|
all_issues.extend(issues)
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
|
||||||
|
def priority(i):
|
||||||
|
t = i['title'].lower()
|
||||||
|
if '[urgent]' in t or 'urgent:' in t: return 0
|
||||||
|
if '[p0]' in t: return 1
|
||||||
|
if '[p1]' in t: return 2
|
||||||
|
if '[bug]' in t: return 3
|
||||||
|
if 'lhf:' in t or 'lhf ' in t: return 4
|
||||||
|
if '[p2]' in t: return 5
|
||||||
|
return 6
|
||||||
|
|
||||||
|
all_issues.sort(key=priority)
|
||||||
|
|
||||||
|
for i in all_issues:
|
||||||
assignees = [a['login'] for a in (i.get('assignees') or [])]
|
assignees = [a['login'] for a in (i.get('assignees') or [])]
|
||||||
if 'gemini' not in assignees:
|
if 'gemini' not in assignees:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
title = i['title'].lower()
|
title = i['title'].lower()
|
||||||
# Skip philosophy, epics, showcases, features (not 10-min code work)
|
|
||||||
if '[philosophy]' in title: continue
|
if '[philosophy]' in title: continue
|
||||||
if '[epic]' in title or 'epic:' in title: continue
|
if '[epic]' in title or 'epic:' in title: continue
|
||||||
if '[showcase]' in title: continue
|
if '[showcase]' in title: continue
|
||||||
if '[feature]' in title: continue
|
|
||||||
|
|
||||||
# Check skip list
|
|
||||||
num_str = str(i['number'])
|
num_str = str(i['number'])
|
||||||
entry = skips.get(num_str, {})
|
if num_str in active_issues: continue
|
||||||
if entry and entry.get('until', 0) > time.time():
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(json.dumps({'number': i['number'], 'title': i['title']}))
|
entry = skips.get(num_str, {})
|
||||||
|
if entry and entry.get('until', 0) > time.time(): continue
|
||||||
|
|
||||||
|
lock = '${LOCK_DIR}/' + i['_repo'].replace('/', '-') + '-' + num_str + '.lock'
|
||||||
|
if os.path.isdir(lock): continue
|
||||||
|
|
||||||
|
repo = i['_repo']
|
||||||
|
owner, name = repo.split('/')
|
||||||
|
print(json.dumps({
|
||||||
|
'number': i['number'],
|
||||||
|
'title': i['title'],
|
||||||
|
'repo_owner': owner,
|
||||||
|
'repo_name': name,
|
||||||
|
'repo': repo,
|
||||||
|
}))
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
print('null')
|
print('null')
|
||||||
" 2>/dev/null
|
" 2>/dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
build_prompt() {
|
build_prompt() {
|
||||||
local issue_num="$1"
|
local issue_num="$1" issue_title="$2" worktree="$3" repo_owner="$4" repo_name="$5"
|
||||||
local issue_title="$2"
|
|
||||||
local worktree="$3"
|
|
||||||
|
|
||||||
cat <<PROMPT
|
cat <<PROMPT
|
||||||
You are Gemini, an autonomous code agent on the Timmy-time-dashboard project.
|
You are Gemini, an autonomous code agent on the ${repo_name} project.
|
||||||
|
|
||||||
YOUR ISSUE: #${issue_num} — "${issue_title}"
|
YOUR ISSUE: #${issue_num} — "${issue_title}"
|
||||||
|
|
||||||
GITEA API: ${GITEA_URL}/api/v1
|
GITEA API: ${GITEA_URL}/api/v1
|
||||||
GITEA TOKEN: ${GITEA_TOKEN}
|
GITEA TOKEN: ${GITEA_TOKEN}
|
||||||
REPO: ${REPO_OWNER}/${REPO_NAME}
|
REPO: ${repo_owner}/${repo_name}
|
||||||
WORKING DIRECTORY: ${worktree}
|
WORKING DIRECTORY: ${worktree}
|
||||||
|
|
||||||
== YOUR POWERS ==
|
== YOUR POWERS ==
|
||||||
You can do ANYTHING a developer can do. You are not limited to the narrow task.
|
You can do ANYTHING a developer can do.
|
||||||
|
|
||||||
1. READ the issue. Read any comments — they may have instructions.
|
1. READ the issue and any comments for context:
|
||||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}"
|
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}"
|
||||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}/comments"
|
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments"
|
||||||
|
|
||||||
2. DO THE WORK. Code, test, fix, refactor — whatever the issue needs.
|
2. DO THE WORK. Code, test, fix, refactor — whatever the issue needs.
|
||||||
- tox -e format (auto-format first)
|
- Check for tox.ini / Makefile / package.json for test/lint commands
|
||||||
- tox -e unit (all tests must pass)
|
- Run tests if the project has them
|
||||||
- tox -e lint (must be clean)
|
- Follow existing code conventions
|
||||||
|
|
||||||
3. COMMIT with conventional commits: fix: / feat: / refactor: / test: / chore:
|
3. COMMIT with conventional commits: fix: / feat: / refactor: / test: / chore:
|
||||||
Include "Fixes #${issue_num}" or "Refs #${issue_num}" in the message.
|
Include "Fixes #${issue_num}" or "Refs #${issue_num}" in the message.
|
||||||
|
|
||||||
4. PUSH to your branch (gemini/issue-${issue_num}) and CREATE A PR:
|
4. PUSH to your branch (gemini/issue-${issue_num}) and CREATE A PR:
|
||||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/pulls" \
|
git push origin gemini/issue-${issue_num}
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls" \\
|
||||||
-H "Content-Type: application/json" \
|
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||||
|
-H "Content-Type: application/json" \\
|
||||||
-d '{"title": "[gemini] <description> (#${issue_num})", "body": "Fixes #${issue_num}\n\n<describe what you did>", "head": "gemini/issue-${issue_num}", "base": "main"}'
|
-d '{"title": "[gemini] <description> (#${issue_num})", "body": "Fixes #${issue_num}\n\n<describe what you did>", "head": "gemini/issue-${issue_num}", "base": "main"}'
|
||||||
|
|
||||||
5. COMMENT on the issue when done:
|
5. COMMENT on the issue when done:
|
||||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}/comments" \
|
curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments" \\
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \\
|
||||||
-d '{"body": "PR created. <summary of changes>"}'
|
-d '{"body": "PR created. <summary of changes>"}'
|
||||||
|
|
||||||
6. FILE NEW ISSUES if you find bugs, missing tests, or improvements while working:
|
|
||||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues" \
|
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"title": "[gemini-generated] <title>", "body": "<description>"}'
|
|
||||||
|
|
||||||
== RULES ==
|
== RULES ==
|
||||||
- Read CLAUDE.md or project README first for conventions
|
- Read CLAUDE.md or project README first for conventions
|
||||||
- tox is the ONLY way to run tests/lint/format. Never run pytest/ruff directly.
|
- If the project has tox, use tox. If npm, use npm. Follow the project.
|
||||||
- Never use --no-verify on git commands.
|
- Never use --no-verify on git commands.
|
||||||
- If tests fail after 2 attempts, STOP and comment on the issue explaining why.
|
- If tests fail after 2 attempts, STOP and comment on the issue explaining why.
|
||||||
- Be thorough. If you see something broken nearby, file an issue for it.
|
- Be thorough but focused. Fix the issue, don't refactor the world.
|
||||||
PROMPT
|
PROMPT
|
||||||
}
|
}
|
||||||
|
|
||||||
# === MAIN LOOP ===
|
# === WORKER FUNCTION ===
|
||||||
log "=== Gemini Loop Started ==="
|
run_worker() {
|
||||||
log "Repo: ${REPO_DIR}"
|
local worker_id="$1"
|
||||||
log "Worktrees: ${WORKTREE_BASE}"
|
local consecutive_failures=0
|
||||||
|
|
||||||
while true; do
|
log "WORKER-${worker_id}: Started"
|
||||||
# Check for too many consecutive failures
|
|
||||||
if [ "$failure_count" -ge "$MAX_FAILURES" ]; then
|
|
||||||
log "BACKOFF: ${failure_count} consecutive failures. Sleeping ${LONG_SLEEP}s..."
|
|
||||||
sleep "$LONG_SLEEP"
|
|
||||||
failure_count=0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Fetch latest main (resilient — never die on git errors)
|
while true; do
|
||||||
cd "$REPO_DIR"
|
if [ "$consecutive_failures" -ge 5 ]; then
|
||||||
timeout 60 git fetch origin main 2>/dev/null || { log "WARN: git fetch failed, continuing anyway"; }
|
local backoff=$((RATE_LIMIT_SLEEP * (consecutive_failures / 5)))
|
||||||
git checkout main 2>/dev/null || true
|
[ "$backoff" -gt "$MAX_RATE_SLEEP" ] && backoff=$MAX_RATE_SLEEP
|
||||||
git reset --hard origin/main 2>/dev/null || true
|
log "WORKER-${worker_id}: BACKOFF ${backoff}s (${consecutive_failures} failures)"
|
||||||
|
sleep "$backoff"
|
||||||
# Get next issue
|
consecutive_failures=0
|
||||||
issue_json=$(get_next_issue)
|
|
||||||
|
|
||||||
if [ "$issue_json" = "null" ] || [ -z "$issue_json" ]; then
|
|
||||||
# Only log idle ONCE, then go quiet until work appears
|
|
||||||
if [ "${LAST_STATE:-}" != "idle" ]; then
|
|
||||||
log "Queue empty. Waiting for assignments..."
|
|
||||||
LAST_STATE="idle"
|
|
||||||
fi
|
fi
|
||||||
sleep "$LONG_SLEEP"
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
LAST_STATE="working"
|
|
||||||
|
|
||||||
issue_num=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['number'])")
|
issue_json=$(get_next_issue)
|
||||||
issue_title=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])")
|
|
||||||
branch="gemini/issue-${issue_num}"
|
|
||||||
worktree="${WORKTREE_BASE}/gemini-${issue_num}"
|
|
||||||
|
|
||||||
log "=== ISSUE #${issue_num}: ${issue_title} ==="
|
if [ "$issue_json" = "null" ] || [ -z "$issue_json" ]; then
|
||||||
|
update_active "$worker_id" "" "" "idle"
|
||||||
|
sleep 60
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
# Create worktree
|
issue_num=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['number'])")
|
||||||
if [ -d "$worktree" ]; then
|
issue_title=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])")
|
||||||
log "Worktree already exists, cleaning..."
|
repo_owner=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_owner'])")
|
||||||
cleanup_worktree "$worktree" "$branch"
|
repo_name=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_name'])")
|
||||||
fi
|
issue_key="${repo_owner}-${repo_name}-${issue_num}"
|
||||||
|
branch="gemini/issue-${issue_num}"
|
||||||
|
worktree="${WORKTREE_BASE}/gemini-w${worker_id}-${issue_num}"
|
||||||
|
|
||||||
cd "$REPO_DIR"
|
if ! lock_issue "$issue_key"; then
|
||||||
if ! git worktree add "$worktree" -b "$branch" origin/main 2>&1; then
|
sleep 5
|
||||||
log "ERROR: Failed to create worktree for #${issue_num}"
|
continue
|
||||||
failure_count=$((failure_count + 1))
|
fi
|
||||||
sleep "$COOLDOWN"
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Configure git remote with gemini's token so it can push
|
log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ==="
|
||||||
cd "$worktree"
|
update_active "$worker_id" "$issue_num" "${repo_owner}/${repo_name}" "working"
|
||||||
git remote set-url origin "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${REPO_OWNER}/${REPO_NAME}.git"
|
|
||||||
cd "$REPO_DIR"
|
|
||||||
|
|
||||||
# Build prompt
|
# Ensure local clone
|
||||||
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree")
|
local_repo="${WORKTREE_BASE}/gemini-base-${repo_owner}-${repo_name}"
|
||||||
|
if [ ! -d "$local_repo" ]; then
|
||||||
|
log "WORKER-${worker_id}: Cloning ${repo_owner}/${repo_name}..."
|
||||||
|
git clone --depth=1 "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git" "$local_repo" 2>&1 || {
|
||||||
|
log "WORKER-${worker_id}: ERROR cloning"
|
||||||
|
unlock_issue "$issue_key"
|
||||||
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
|
sleep "$COOLDOWN"
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cd "$local_repo"
|
||||||
|
git fetch --unshallow origin main 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
# Run Gemini Code CLI with timeout
|
cd "$local_repo"
|
||||||
log "Launching Gemini Code for #${issue_num} (timeout: ${GEMINI_TIMEOUT}s)..."
|
timeout 60 git fetch origin main 2>/dev/null || true
|
||||||
|
git checkout main 2>/dev/null || true
|
||||||
|
git reset --hard origin/main 2>/dev/null || true
|
||||||
|
|
||||||
set +e
|
[ -d "$worktree" ] && cleanup_worktree "$worktree" "$branch"
|
||||||
cd "$worktree"
|
cd "$local_repo"
|
||||||
gtimeout "$GEMINI_TIMEOUT" gemini \
|
|
||||||
--print \
|
|
||||||
--quiet \
|
|
||||||
-w "$worktree" \
|
|
||||||
-p "$prompt" \
|
|
||||||
</dev/null 2>&1 | tee "$LOG_DIR/gemini-${issue_num}.log"
|
|
||||||
exit_code=${PIPESTATUS[0]}
|
|
||||||
cd "$REPO_DIR"
|
|
||||||
set -e
|
|
||||||
|
|
||||||
if [ "$exit_code" -eq 0 ]; then
|
if ! git worktree add "$worktree" -b "$branch" origin/main 2>&1; then
|
||||||
log "SUCCESS: #${issue_num} completed — attempting auto-merge..."
|
log "WORKER-${worker_id}: ERROR creating worktree"
|
||||||
|
unlock_issue "$issue_key"
|
||||||
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
|
sleep "$COOLDOWN"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
# Find and merge the PR gemini created
|
cd "$worktree"
|
||||||
pr_num=$(curl -sf "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/pulls?state=open&head=${REPO_OWNER}:${branch}&limit=1" \
|
git remote set-url origin "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git"
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
|
||||||
|
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name")
|
||||||
|
|
||||||
|
log "WORKER-${worker_id}: Launching Gemini Code for #${issue_num}..."
|
||||||
|
|
||||||
|
set +e
|
||||||
|
cd "$worktree"
|
||||||
|
gtimeout "$GEMINI_TIMEOUT" gemini \
|
||||||
|
-p "$prompt" \
|
||||||
|
--yolo \
|
||||||
|
</dev/null >> "$LOG_DIR/gemini-${issue_num}.log" 2>&1
|
||||||
|
exit_code=$?
|
||||||
|
set -e
|
||||||
|
|
||||||
|
if [ "$exit_code" -eq 0 ]; then
|
||||||
|
log "WORKER-${worker_id}: SUCCESS #${issue_num}"
|
||||||
|
|
||||||
|
pr_num=$(curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls?state=open&head=${repo_owner}:${branch}&limit=1" \
|
||||||
|
-H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
||||||
import sys,json
|
import sys,json
|
||||||
prs = json.load(sys.stdin)
|
prs = json.load(sys.stdin)
|
||||||
if prs: print(prs[0]['number'])
|
if prs: print(prs[0]['number'])
|
||||||
else: print('')
|
else: print('')
|
||||||
" 2>/dev/null)
|
" 2>/dev/null)
|
||||||
|
|
||||||
if [ -n "$pr_num" ]; then
|
if [ -n "$pr_num" ]; then
|
||||||
merge_result=$(curl -sf -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/pulls/${pr_num}/merge" \
|
curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}/merge" \
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{"Do": "squash"}' 2>&1) || true
|
-d '{"Do": "squash"}' >/dev/null 2>&1 || true
|
||||||
log " PR #${pr_num} merge attempted"
|
curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}" \
|
||||||
|
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"state": "closed"}' >/dev/null 2>&1 || true
|
||||||
|
log "WORKER-${worker_id}: PR #${pr_num} merged, issue #${issue_num} closed"
|
||||||
|
fi
|
||||||
|
|
||||||
|
consecutive_failures=0
|
||||||
|
|
||||||
|
elif [ "$exit_code" -eq 124 ]; then
|
||||||
|
log "WORKER-${worker_id}: TIMEOUT #${issue_num}"
|
||||||
|
mark_skip "$issue_num" "timeout" 1
|
||||||
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
|
|
||||||
# Close the issue (Gitea auto-close via "Fixes #N" is unreliable)
|
|
||||||
curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}" \
|
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"state": "closed"}' >/dev/null 2>&1 || true
|
|
||||||
log " Issue #${issue_num} closed"
|
|
||||||
else
|
else
|
||||||
log " WARN: No open PR found for branch ${branch}"
|
if grep -q "rate_limit\|rate limit\|429\|overloaded\|quota" "$LOG_DIR/gemini-${issue_num}.log" 2>/dev/null; then
|
||||||
|
log "WORKER-${worker_id}: RATE LIMITED on #${issue_num}"
|
||||||
|
mark_skip "$issue_num" "rate_limit" 0.25
|
||||||
|
consecutive_failures=$((consecutive_failures + 3))
|
||||||
|
else
|
||||||
|
log "WORKER-${worker_id}: FAILED #${issue_num} (exit ${exit_code})"
|
||||||
|
mark_skip "$issue_num" "exit_code_${exit_code}" 1
|
||||||
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
failure_count=0
|
cleanup_worktree "$worktree" "$branch"
|
||||||
issues_completed=$((issues_completed + 1))
|
unlock_issue "$issue_key"
|
||||||
log "Stats: ${issues_completed} issues completed this session"
|
update_active "$worker_id" "" "" "done"
|
||||||
elif [ "$exit_code" -eq 124 ]; then
|
|
||||||
log "TIMEOUT: #${issue_num} exceeded ${GEMINI_TIMEOUT}s"
|
|
||||||
mark_skip "$issue_num" "timeout" 1
|
|
||||||
failure_count=$((failure_count + 1))
|
|
||||||
else
|
|
||||||
log "FAILED: #${issue_num} exited with code ${exit_code}"
|
|
||||||
mark_skip "$issue_num" "exit_code_${exit_code}" 1
|
|
||||||
failure_count=$((failure_count + 1))
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clean up worktree
|
sleep "$COOLDOWN"
|
||||||
cleanup_worktree "$worktree" "$branch"
|
done
|
||||||
|
}
|
||||||
|
|
||||||
# Cooldown
|
# === MAIN ===
|
||||||
log "Cooling down ${COOLDOWN}s before next issue..."
|
log "=== Gemini Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ==="
|
||||||
sleep "$COOLDOWN"
|
log "Worktrees: ${WORKTREE_BASE}"
|
||||||
|
|
||||||
|
rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
|
||||||
|
|
||||||
|
# PID tracking via files (bash 3.2 compatible)
|
||||||
|
PID_DIR="$LOG_DIR/gemini-pids"
|
||||||
|
mkdir -p "$PID_DIR"
|
||||||
|
rm -f "$PID_DIR"/*.pid 2>/dev/null
|
||||||
|
|
||||||
|
launch_worker() {
|
||||||
|
local wid="$1"
|
||||||
|
run_worker "$wid" &
|
||||||
|
echo $! > "$PID_DIR/${wid}.pid"
|
||||||
|
log "Launched worker $wid (PID $!)"
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in $(seq 1 "$NUM_WORKERS"); do
|
||||||
|
launch_worker "$i"
|
||||||
|
sleep 3
|
||||||
|
done
|
||||||
|
|
||||||
|
# Dynamic scaler — every 3 minutes
|
||||||
|
CURRENT_WORKERS="$NUM_WORKERS"
|
||||||
|
while true; do
|
||||||
|
sleep 180
|
||||||
|
|
||||||
|
# Reap dead workers
|
||||||
|
for pidfile in "$PID_DIR"/*.pid; do
|
||||||
|
[ -f "$pidfile" ] || continue
|
||||||
|
wid=$(basename "$pidfile" .pid)
|
||||||
|
wpid=$(cat "$pidfile")
|
||||||
|
if ! kill -0 "$wpid" 2>/dev/null; then
|
||||||
|
log "SCALER: Worker $wid died — relaunching"
|
||||||
|
launch_worker "$wid"
|
||||||
|
sleep 2
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
recent_rate_limits=$(tail -100 "$LOG_DIR/gemini-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true)
|
||||||
|
recent_successes=$(tail -100 "$LOG_DIR/gemini-loop.log" 2>/dev/null | grep -c "SUCCESS" || true)
|
||||||
|
|
||||||
|
if [ "$recent_rate_limits" -gt 0 ]; then
|
||||||
|
if [ "$CURRENT_WORKERS" -gt 2 ]; then
|
||||||
|
drop_to=$(( CURRENT_WORKERS / 2 ))
|
||||||
|
[ "$drop_to" -lt 2 ] && drop_to=2
|
||||||
|
log "SCALER: Rate limited — scaling ${CURRENT_WORKERS} → ${drop_to}"
|
||||||
|
for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do
|
||||||
|
if [ -f "$PID_DIR/${wid}.pid" ]; then
|
||||||
|
kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true
|
||||||
|
rm -f "$PID_DIR/${wid}.pid"
|
||||||
|
update_active "$wid" "" "" "done"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
CURRENT_WORKERS=$drop_to
|
||||||
|
fi
|
||||||
|
elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then
|
||||||
|
new_count=$(( CURRENT_WORKERS + 2 ))
|
||||||
|
[ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS
|
||||||
|
log "SCALER: Healthy — scaling ${CURRENT_WORKERS} → ${new_count}"
|
||||||
|
for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do
|
||||||
|
launch_worker "$wid"
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
CURRENT_WORKERS=$new_count
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|||||||
251
bin/loop-watchdog.sh
Normal file
251
bin/loop-watchdog.sh
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
#!/usr/bin/env bash
# loop-watchdog.sh — Self-healing monitor for all agent loops
# Runs every 2 minutes. Restarts dead loops, kills zombies,
# and files Gitea issues for problems it can't auto-fix.
#
# Usage: Run via cron or: nohup bash ~/.hermes/bin/loop-watchdog.sh &

# NOTE(review): -e is deliberately absent here (unlike the loop scripts),
# presumably so a failing health probe doesn't kill the watchdog — confirm.
set -uo pipefail

# All watchdog state lives under ~/.hermes/logs.
LOG_DIR="$HOME/.hermes/logs"
LOG="$LOG_DIR/watchdog.log"
ISSUE_LOG="$LOG_DIR/watchdog-issues.json" # tracks filed issues to avoid duplicates
GITEA_URL="http://143.198.27.163:3000"
# May be empty if the token file is missing (cat errors are suppressed);
# issue filing will then fail and be logged as a WARN downstream.
ADMIN_TOKEN=$(cat "$HOME/.config/gitea/token" 2>/dev/null)
ISSUE_REPO="rockachopa/hermes-agent" # ops issues go here
CHECK_INTERVAL=120 # 2 minutes

mkdir -p "$LOG_DIR"
# Seed the dedup store with an empty JSON object on first run.
[ -f "$ISSUE_LOG" ] || echo '{}' > "$ISSUE_LOG"
||||||
|
log() {
  # Append one timestamped watchdog line to the shared log file.
  printf '[%s] WATCHDOG: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG"
}
|
||||||
|
|
||||||
|
# File a Gitea issue for problems that can't be auto-fixed.
# Deduplicates: won't file the same issue_key within 6 hours.
#
# Arguments:
#   $1 - issue_key: stable dedup key (e.g. "gitea-down")
#   $2 - title:     issue title (prefixed with "[watchdog]" on Gitea)
#   $3 - body:      issue body; "\n" sequences become newlines via the
#                   python triple-quoted string below
#   $4 - assignee:  Gitea username, defaults to "claude"
#
# NOTE(review): $title/$body/$issue_key are interpolated directly into
# python source — quotes or triple-quotes in those values would break or
# inject into the snippet. Inputs are watchdog-generated, but confirm.
file_issue() {
  local issue_key="$1"
  local title="$2"
  local body="$3"
  local assignee="${4:-claude}"

  # Check if we already filed this recently.
  # Side effect: when filing proceeds, the key is recorded in ISSUE_LOG
  # with a 6-hour (21600 s) suppression window.
  local should_file
  should_file=$(python3 -c "
import json, time
try:
    with open('$ISSUE_LOG') as f: filed = json.load(f)
except: filed = {}
entry = filed.get('$issue_key', {})
if entry and entry.get('until', 0) > time.time():
    print('no')
else:
    filed['$issue_key'] = {'until': time.time() + 21600, 'title': '''$title'''}
    with open('$ISSUE_LOG', 'w') as f: json.dump(filed, f, indent=2)
    print('yes')
" 2>/dev/null)

  # Anything other than an explicit "yes" (including python failing and
  # printing nothing) suppresses filing.
  if [ "$should_file" != "yes" ]; then
    return 0
  fi

  log "FILING ISSUE: $title"
  # Build the JSON payload with python (handles quoting/newlines), then
  # POST it; on any curl failure just log a WARN — never crash the loop.
  curl -sf -X POST "${GITEA_URL}/api/v1/repos/${ISSUE_REPO}/issues" \
    -H "Authorization: token ${ADMIN_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$(python3 -c "
import json
print(json.dumps({
    'title': '[watchdog] $title',
    'body': '''$body

---
*Auto-filed by loop-watchdog at $(date '+%Y-%m-%d %H:%M:%S')*''',
    'assignees': ['$assignee'],
}))" 2>/dev/null)" >/dev/null 2>&1 || log "WARN: Failed to file issue: $title"
}
|
||||||
|
|
||||||
|
# === HEALTH CHECKS ===

# Check one agent loop: restart it if dead, kick it if stalled, and file
# a Gitea issue when a restart fails or the loop is crash-looping.
#
# Arguments:
#   $1 - name:       agent label used in logs/issue keys
#   $2 - grep_pat:   pgrep -f pattern that identifies the loop process
#   $3 - wake_cmd:   shell command (eval'd) that restarts the loop
#   $4 - log_file:   the loop's log, used for stall/crash detection
#   $5 - worker_pat: optional pgrep -f pattern for its worker processes
check_loop() {
  local name="$1" # kimi | claude | gemini
  local grep_pat="$2" # pattern to find the loop process
  local wake_cmd="$3" # command to restart
  local log_file="$4" # log to check for errors
  local worker_pat="${5:-}" # optional: pattern for worker processes

  local pid
  pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)

  if [ -z "$pid" ]; then
    log "$name loop DOWN — restarting..."
    # wake_cmd is a trusted, hardcoded string from main below — eval is
    # needed because it contains redirections and a trailing '&'.
    eval "$wake_cmd"
    sleep 3

    # Verify it came back
    pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
    if [ -z "$pid" ]; then
      file_issue \
        "${name}-loop-dead" \
        "${name} loop won't start" \
        "The ${name} agent loop failed to start after automatic restart attempt.\n\nRestart command: \`${wake_cmd}\`\n\nLast 20 lines of log:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`\n\nPlease investigate and fix the startup issue." \
        "claude"
    else
      log "$name loop restarted (PID $pid)"
    fi
    return
  fi

  # Loop is running — check for stalls
  if [ -f "$log_file" ]; then
    local last_activity
    # BSD stat (-f %m) first, GNU stat (-c %Y) fallback, else 0 — this
    # script must run under bash 3.2 on macOS as well as Linux.
    last_activity=$(stat -f %m "$log_file" 2>/dev/null || stat -c %Y "$log_file" 2>/dev/null || echo 0)
    local now
    now=$(date +%s)
    local stale_seconds=$(( now - last_activity ))

    # If no log activity for 30 minutes, something is wrong
    if [ "$stale_seconds" -gt 1800 ]; then
      log "$name loop STALE — no activity for ${stale_seconds}s"

      # Check if it's just idle (empty queue) vs truly stuck
      local last_line
      last_line=$(tail -1 "$log_file" 2>/dev/null)
      if echo "$last_line" | grep -q "Queue empty\|Waiting for assignments\|idle"; then
        # Just idle, that's fine
        return
      fi

      # Kill and restart — take the workers down too so the relaunched
      # loop doesn't collide with orphans.
      log "$name loop stuck — killing and restarting..."
      pkill -f "$grep_pat" 2>/dev/null
      [ -n "$worker_pat" ] && pkill -f "$worker_pat" 2>/dev/null
      sleep 2
      eval "$wake_cmd"
      sleep 3

      pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
      if [ -z "$pid" ]; then
        file_issue \
          "${name}-loop-stuck" \
          "${name} loop stuck and won't restart" \
          "The ${name} loop was stale (${stale_seconds}s no activity) and failed to restart.\n\nLast 20 lines:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`" \
          "claude"
      else
        log "$name loop recovered (PID $pid)"
      fi
    fi

    # Check for crash loops (5+ failures in last 50 lines)
    local recent_failures
    recent_failures=$(tail -50 "$log_file" 2>/dev/null | grep -c "FAILED:\|ERROR:" || true)
    if [ "$recent_failures" -ge 5 ]; then
      local error_sample
      error_sample=$(tail -50 "$log_file" 2>/dev/null | grep "FAILED:\|ERROR:" | tail -5)
      # file_issue dedups on the key, so this fires at most once per 6 h.
      file_issue \
        "${name}-crash-loop" \
        "${name} agent in crash loop (${recent_failures} recent failures)" \
        "The ${name} agent loop has ${recent_failures} failures in its last 50 log lines, suggesting a systemic problem.\n\nRecent errors:\n\`\`\`\n${error_sample}\n\`\`\`\n\nPlease investigate the root cause — could be API issues, bad repo state, or CLI problems." \
        "claude"
    fi
  fi
}
|
||||||
|
|
||||||
|
check_gitea() {
  # Probe the Gitea API with a short timeout; every agent loop depends
  # on this instance, so an outage is filed immediately.
  if curl -sf --max-time 5 "${GITEA_URL}/api/v1/version" >/dev/null 2>&1; then
    return 0
  fi

  log "Gitea UNREACHABLE"
  file_issue \
    "gitea-down" \
    "Gitea instance unreachable" \
    "The Gitea instance at ${GITEA_URL} is not responding. All agent loops depend on it.\n\nThis needs immediate attention — check the VPS at 143.198.27.163." \
    "claude"
}
|
||||||
|
|
||||||
|
# Kill runaway git/pytest processes left behind by worker loops.
# Thresholds are deliberately loose (>3) so normal concurrent activity
# is never touched.
check_zombies() {
  local stuck_git
  stuck_git=$(ps aux | grep -E "git.*push|git-remote-http" | grep -v grep | wc -l | tr -d ' ')
  local orphan_py
  orphan_py=$(ps aux | grep "pytest tests/" | grep -v grep | wc -l | tr -d ' ')

  if [ "$stuck_git" -gt 3 ]; then
    log "Killing $stuck_git stuck git processes"
    # pkill -f takes an ERE, so alternation is a plain "|". The previous
    # BRE-style "git.*push\|git-remote-http" never matched under ERE,
    # so stuck pushes were counted but never actually killed.
    pkill -f "git.*push|git-remote-http" 2>/dev/null || true
  fi

  if [ "$orphan_py" -gt 3 ]; then
    log "Killing $orphan_py orphaned pytest processes"
    pkill -f "pytest tests/" 2>/dev/null || true
  fi
}
|
||||||
|
|
||||||
|
check_disk() {
  # Worktree dirs should be cleaned up by the loops themselves; a large
  # pile-up means some loop's cleanup path is broken.
  local worktree_count
  worktree_count=$(find "$HOME/worktrees" -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')

  # Up to 30 entries (including the base dir itself) is considered normal.
  [ "$worktree_count" -le 30 ] && return 0

  log "WARN: $worktree_count worktrees — possible leak"
  file_issue \
    "worktree-leak" \
    "Worktree accumulation: ${worktree_count} dirs in ~/worktrees" \
    "There are ${worktree_count} worktree directories, suggesting cleanup is failing.\n\nList:\n\`\`\`\n$(ls -1 "$HOME/worktrees/" 2>/dev/null | head -30)\n\`\`\`\n\nPlease investigate which loops are leaking worktrees and fix the cleanup." \
    "claude"
}
|
||||||
|
|
||||||
|
# Detect systemic blockage: if an agent's skip list has many active
# entries, most of its queue is blocked and a human should look.
check_skip_lists() {
  # If all agents have full skip lists, the whole system is stuck
  for agent in claude gemini kimi; do
    local skip_file="$LOG_DIR/${agent}-skip-list.json"
    [ -f "$skip_file" ] || continue
    # Count only entries whose suppression window ("until") is still in
    # the future; expired skips don't indicate a current problem.
    local skip_count
    skip_count=$(python3 -c "
import json, time
try:
    with open('$skip_file') as f: skips = json.load(f)
    active = sum(1 for v in skips.values() if v.get('until',0) > time.time())
    print(active)
except: print(0)
" 2>/dev/null)

    # ${skip_count:-0} guards against python printing nothing at all.
    if [ "${skip_count:-0}" -gt 10 ]; then
      file_issue \
        "${agent}-skip-overload" \
        "${agent} has ${skip_count} skipped issues — systemic failure" \
        "The ${agent} agent has ${skip_count} issues in its skip list, meaning most of its queue is blocked.\n\nSkip list contents:\n\`\`\`\n$(cat "$skip_file" 2>/dev/null | python3 -m json.tool 2>/dev/null | head -40)\n\`\`\`\n\nThis likely indicates a systemic problem (broken tests, bad config, API changes)." \
        "claude"
    fi
  done
}
|
||||||
|
|
||||||
|
# === MAIN ===
# Runs forever; each pass performs all health checks, then sleeps
# CHECK_INTERVAL (120 s). Intended to be the only foreground activity.
log "=== Watchdog Started ==="

while true; do
  # Gitea must be up for anything to work
  check_gitea

  # Check each agent loop
  # args: name, pgrep pattern, restart command, log file, worker pattern
  check_loop "kimi" "kimi-loop.sh" \
    "nohup bash ~/.hermes/bin/kimi-loop.sh >> ~/.hermes/logs/kimi-loop.log 2>&1 &" \
    "$LOG_DIR/kimi-loop.log" \
    "kimi.*--print"

  # NOTE(review): restarts the claude loop with 3 workers, while
  # claude-loop.sh's own default is 7 — confirm this is intentional.
  check_loop "claude" "claude-loop.sh" \
    "nohup bash ~/.hermes/bin/claude-loop.sh 3 >> ~/.hermes/logs/claude-loop.log 2>&1 &" \
    "$LOG_DIR/claude-loop.log" \
    "claude.*--print.*--dangerously"

  check_loop "gemini" "gemini-loop.sh" \
    "nohup bash ~/.hermes/bin/gemini-loop.sh >> ~/.hermes/logs/gemini-loop.log 2>&1 &" \
    "$LOG_DIR/gemini-loop.log" \
    "gemini.*-p"

  # Housekeeping
  check_zombies
  check_disk
  check_skip_lists

  sleep "$CHECK_INTERVAL"
done
|
||||||
@@ -35,6 +35,10 @@ ops-help() {
|
|||||||
echo " ops-kill-gemini Stop Gemini loop"
|
echo " ops-kill-gemini Stop Gemini loop"
|
||||||
echo " ops-kill-zombies Kill stuck git/pytest"
|
echo " ops-kill-zombies Kill stuck git/pytest"
|
||||||
echo ""
|
echo ""
|
||||||
|
echo -e " \033[1mWatchdog\033[0m"
|
||||||
|
echo " ops-wake-watchdog Start loop watchdog"
|
||||||
|
echo " ops-kill-watchdog Stop loop watchdog"
|
||||||
|
echo ""
|
||||||
echo -e " \033[2m Type ops-help to see this again\033[0m"
|
echo -e " \033[2m Type ops-help to see this again\033[0m"
|
||||||
echo ""
|
echo ""
|
||||||
}
|
}
|
||||||
@@ -196,3 +200,15 @@ ops-kill-zombies() {
|
|||||||
done
|
done
|
||||||
echo " Killed $killed zombie processes"
|
echo " Killed $killed zombie processes"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ops-wake-watchdog() {
  # Restart the watchdog: drop any running instance first, then relaunch
  # detached with its output appended to the watchdog log.
  pkill -f "loop-watchdog.sh" 2>/dev/null
  sleep 1
  nohup bash ~/.hermes/bin/loop-watchdog.sh >> ~/.hermes/logs/watchdog.log 2>&1 &
  local wd_pid=$!
  echo " Watchdog started (PID $wd_pid)"
}
|
||||||
|
|
||||||
|
ops-kill-watchdog() {
  # Stop the watchdog loop if one is running; quiet if it isn't.
  pkill -f "loop-watchdog.sh" 2>/dev/null || true
  echo " Watchdog stopped"
}
||||||
|
|||||||
Reference in New Issue
Block a user