From 5f8129d346ae0bfdc136df94109cce1d1313e91e Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Sun, 22 Mar 2026 19:22:18 -0400 Subject: [PATCH] feat: parallel workers for all agents, dynamic scaling, self-healing watchdog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - claude-loop: 7 workers default, scales up to 21, 5s cooldown - gemini-loop: rewritten as parallel worker system (3→12), multi-repo, auto-clone, correct CLI flags (-p/--yolo), bash 3.2 compatible - loop-watchdog: monitors all loops every 2min, auto-restarts dead loops, kills zombies, files Gitea issues for unfixable problems - ops-helpers: added ops-wake-watchdog, ops-kill-watchdog - All scripts use file-based PID tracking (bash 3.2 safe) Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + bin/claude-loop.sh | 74 +++++- bin/gemini-loop.sh | 538 ++++++++++++++++++++++++++----------------- bin/loop-watchdog.sh | 251 ++++++++++++++++++++ bin/ops-helpers.sh | 16 ++ 5 files changed, 660 insertions(+), 220 deletions(-) create mode 100644 bin/loop-watchdog.sh diff --git a/.gitignore b/.gitignore index 6d702bf..7be33a3 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ bin/* !bin/timmy-loopstat.sh !bin/start-dashboard.sh !bin/gemini-loop.sh +!bin/loop-watchdog.sh # ── Queue (transient task queue) ───────────────────────────────────── queue/ diff --git a/bin/claude-loop.sh b/bin/claude-loop.sh index 519b318..c4d2450 100755 --- a/bin/claude-loop.sh +++ b/bin/claude-loop.sh @@ -8,12 +8,13 @@ set -euo pipefail # === CONFIG === -NUM_WORKERS="${1:-3}" +NUM_WORKERS="${1:-7}" +MAX_WORKERS=21 # absolute ceiling WORKTREE_BASE="$HOME/worktrees" GITEA_URL="http://143.198.27.163:3000" GITEA_TOKEN=$(cat "$HOME/.hermes/claude_token") CLAUDE_TIMEOUT=900 # 15 min per issue -COOLDOWN=15 # seconds between launching workers +COOLDOWN=5 # seconds between issues (fast cycle) RATE_LIMIT_SLEEP=60 # initial sleep on rate limit MAX_RATE_SLEEP=300 # max backoff on rate limit 
LOG_DIR="$HOME/.hermes/logs" @@ -402,18 +403,73 @@ else: print('') } # === MAIN === -log "=== Claude Loop Started — ${NUM_WORKERS} workers ===" +log "=== Claude Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ===" log "Worktrees: ${WORKTREE_BASE}" # Clean stale locks rm -rf "$LOCK_DIR"/*.lock 2>/dev/null -# Launch workers +# PID tracking via files (bash 3.2 compatible) +PID_DIR="$LOG_DIR/claude-pids" +mkdir -p "$PID_DIR" +rm -f "$PID_DIR"/*.pid 2>/dev/null + +launch_worker() { + local wid="$1" + run_worker "$wid" & + echo $! > "$PID_DIR/${wid}.pid" + log "Launched worker $wid (PID $!)" +} + +# Initial launch for i in $(seq 1 "$NUM_WORKERS"); do - run_worker "$i" & - log "Launched worker $i (PID $!)" - sleep 5 # stagger starts + launch_worker "$i" + sleep 3 done -# Wait for all workers -wait +# === DYNAMIC SCALER === +# Every 3 minutes: check health, scale up if no rate limits, scale down if hitting limits +CURRENT_WORKERS="$NUM_WORKERS" +while true; do + sleep 180 + + # Reap dead workers and relaunch + for pidfile in "$PID_DIR"/*.pid; do + [ -f "$pidfile" ] || continue + wid=$(basename "$pidfile" .pid) + wpid=$(cat "$pidfile") + if ! 
kill -0 "$wpid" 2>/dev/null; then + log "SCALER: Worker $wid died — relaunching" + launch_worker "$wid" + sleep 2 + fi + done + + recent_rate_limits=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true) + recent_successes=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "SUCCESS" || true) + + if [ "$recent_rate_limits" -gt 0 ]; then + if [ "$CURRENT_WORKERS" -gt 2 ]; then + drop_to=$(( CURRENT_WORKERS / 2 )) + [ "$drop_to" -lt 2 ] && drop_to=2 + log "SCALER: Rate limited — scaling ${CURRENT_WORKERS} → ${drop_to} workers" + for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do + if [ -f "$PID_DIR/${wid}.pid" ]; then + kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true + rm -f "$PID_DIR/${wid}.pid" + update_active "$wid" "" "" "done" + fi + done + CURRENT_WORKERS=$drop_to + fi + elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then + new_count=$(( CURRENT_WORKERS + 2 )) + [ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS + log "SCALER: Healthy — scaling ${CURRENT_WORKERS} → ${new_count} workers" + for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do + launch_worker "$wid" + sleep 2 + done + CURRENT_WORKERS=$new_count + fi +done diff --git a/bin/gemini-loop.sh b/bin/gemini-loop.sh index 9fd2a8b..505ef42 100755 --- a/bin/gemini-loop.sh +++ b/bin/gemini-loop.sh @@ -1,321 +1,437 @@ #!/usr/bin/env bash -# gemini-loop.sh — Dropout-proof Gemini code agent dispatch loop -# Picks an open issue from Gitea, creates a worktree, runs Gemini Code CLI, -# handles failures gracefully, and loops forever. +# gemini-loop.sh — Parallel Gemini Code agent dispatch loop +# Runs N workers concurrently against the Gitea backlog. +# Dynamic scaling: starts at N, scales up to MAX, drops on rate limits. 
# -# Dropout-proof means: -# - If Gemini Code crashes/hangs, we kill it and move on -# - If worktree creation fails, skip and retry -# - If push fails, log and continue -# - Exponential backoff on repeated failures -# - Clean up worktrees after PR is created +# Usage: gemini-loop.sh [NUM_WORKERS] (default: 3) set -euo pipefail # === CONFIG === -REPO_DIR="$HOME/worktrees/gemini-repo" +NUM_WORKERS="${1:-3}" +MAX_WORKERS=12 WORKTREE_BASE="$HOME/worktrees" GITEA_URL="http://143.198.27.163:3000" GITEA_TOKEN=$(cat "$HOME/.hermes/gemini_token") -REPO_OWNER="rockachopa" -REPO_NAME="Timmy-time-dashboard" GEMINI_TIMEOUT=600 # 10 min per issue -COOLDOWN=30 # seconds between issues -MAX_FAILURES=5 # consecutive failures before long sleep -LONG_SLEEP=300 # 5 min backoff on repeated failures +COOLDOWN=5 # seconds between issues +RATE_LIMIT_SLEEP=60 +MAX_RATE_SLEEP=300 LOG_DIR="$HOME/.hermes/logs" -SKIP_FILE="$LOG_DIR/gemini-skip-list.json" # issues to skip temporarily +SKIP_FILE="$LOG_DIR/gemini-skip-list.json" +LOCK_DIR="$LOG_DIR/gemini-locks" +ACTIVE_FILE="$LOG_DIR/gemini-active.json" -mkdir -p "$LOG_DIR" "$WORKTREE_BASE" - -# Initialize skip file if missing +mkdir -p "$LOG_DIR" "$WORKTREE_BASE" "$LOCK_DIR" [ -f "$SKIP_FILE" ] || echo '{}' > "$SKIP_FILE" +echo '{}' > "$ACTIVE_FILE" -# === STATE === -failure_count=0 -issues_completed=0 +# === SHARED FUNCTIONS === +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_DIR/gemini-loop.log" +} -# === SKIP LIST FUNCTIONS === -is_skipped() { - local issue_num="$1" - python3 -c " -import json, time, sys -try: - with open('$SKIP_FILE') as f: skips = json.load(f) -except: skips = {} -entry = skips.get(str($issue_num), {}) -if entry and entry.get('until', 0) > time.time(): - print('skip') - sys.exit(0) -# Expired or not found — clean up and allow -if str($issue_num) in skips: - del skips[str($issue_num)] - with open('$SKIP_FILE', 'w') as f: json.dump(skips, f) -print('ok') -" 2>/dev/null +lock_issue() { + local issue_key="$1" + 
local lockfile="$LOCK_DIR/$issue_key.lock" + if mkdir "$lockfile" 2>/dev/null; then + echo $$ > "$lockfile/pid" + return 0 + fi + return 1 +} + +unlock_issue() { + rm -rf "$LOCK_DIR/$1.lock" 2>/dev/null } mark_skip() { - local issue_num="$1" - local reason="$2" - local skip_hours="${3:-1}" # default 1 hour + local issue_num="$1" reason="$2" skip_hours="${3:-1}" python3 -c " -import json, time -try: - with open('$SKIP_FILE') as f: skips = json.load(f) -except: skips = {} -skips[str($issue_num)] = { - 'until': time.time() + ($skip_hours * 3600), - 'reason': '$reason', - 'failures': skips.get(str($issue_num), {}).get('failures', 0) + 1 -} -# If 3+ failures, skip for 6 hours instead -if skips[str($issue_num)]['failures'] >= 3: - skips[str($issue_num)]['until'] = time.time() + (6 * 3600) -with open('$SKIP_FILE', 'w') as f: json.dump(skips, f, indent=2) +import json, time, fcntl +with open('$SKIP_FILE', 'r+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + try: skips = json.load(f) + except: skips = {} + skips[str($issue_num)] = { + 'until': time.time() + ($skip_hours * 3600), + 'reason': '$reason', + 'failures': skips.get(str($issue_num), {}).get('failures', 0) + 1 + } + if skips[str($issue_num)]['failures'] >= 3: + skips[str($issue_num)]['until'] = time.time() + (6 * 3600) + f.seek(0) + f.truncate() + json.dump(skips, f, indent=2) " 2>/dev/null - log "SKIP: #${issue_num} added to skip list — ${reason}" + log "SKIP: #${issue_num} — ${reason}" } -log() { - local msg="[$(date '+%Y-%m-%d %H:%M:%S')] $*" - echo "$msg" >> "$LOG_DIR/gemini-loop.log" +update_active() { + local worker="$1" issue="$2" repo="$3" status="$4" + python3 -c " +import json, fcntl +with open('$ACTIVE_FILE', 'r+') as f: + fcntl.flock(f, fcntl.LOCK_EX) + try: active = json.load(f) + except: active = {} + if '$status' == 'done': + active.pop('$worker', None) + else: + active['$worker'] = {'issue': '$issue', 'repo': '$repo', 'status': '$status'} + f.seek(0) + f.truncate() + json.dump(active, f, indent=2) +" 
2>/dev/null } cleanup_worktree() { - local wt="$1" - local branch="$2" + local wt="$1" branch="$2" if [ -d "$wt" ]; then - cd "$REPO_DIR" + local parent + parent=$(git -C "$wt" rev-parse --git-common-dir 2>/dev/null | sed 's|/.git$||' || true) + [ -n "$parent" ] && [ -d "$parent" ] && cd "$parent" git worktree remove --force "$wt" 2>/dev/null || rm -rf "$wt" git worktree prune 2>/dev/null git branch -D "$branch" 2>/dev/null || true - log "Cleaned up worktree: $wt" fi } get_next_issue() { - # Get open issues ASSIGNED TO GEMINI only — Gemini works its own queue - # NOTE: Gitea's assignee filter is unreliable — we validate in Python - local skip_file="$SKIP_FILE" - curl -sf "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues?state=open&type=issues&limit=50&sort=created" \ - -H "Authorization: token ${GITEA_TOKEN}" | python3 -c " -import sys, json, time + python3 -c " +import json, sys, time, urllib.request, os -issues = json.load(sys.stdin) -# Reverse to oldest-first (Gitea returns newest-first) — respects dependency order -issues.reverse() +token = '${GITEA_TOKEN}' +base = '${GITEA_URL}' +repos = [ + 'rockachopa/Timmy-time-dashboard', + 'rockachopa/alexanderwhitestone.com', + 'rockachopa/hermes-agent', + 'replit/timmy-tower', + 'replit/token-gated-economy', +] -# Load skip list try: - with open('${skip_file}') as f: skips = json.load(f) + with open('${SKIP_FILE}') as f: skips = json.load(f) except: skips = {} -for i in issues: - # MUST be assigned to gemini (Gitea filter is broken, validate here) +try: + with open('${ACTIVE_FILE}') as f: + active = json.load(f) + active_issues = {v['issue'] for v in active.values()} +except: + active_issues = set() + +all_issues = [] +for repo in repos: + url = f'{base}/api/v1/repos/{repo}/issues?state=open&type=issues&limit=50&sort=created' + req = urllib.request.Request(url, headers={'Authorization': f'token {token}'}) + try: + resp = urllib.request.urlopen(req, timeout=10) + issues = json.loads(resp.read()) + for i in 
issues: + i['_repo'] = repo + all_issues.extend(issues) + except: + continue + +def priority(i): + t = i['title'].lower() + if '[urgent]' in t or 'urgent:' in t: return 0 + if '[p0]' in t: return 1 + if '[p1]' in t: return 2 + if '[bug]' in t: return 3 + if 'lhf:' in t or 'lhf ' in t: return 4 + if '[p2]' in t: return 5 + return 6 + +all_issues.sort(key=priority) + +for i in all_issues: assignees = [a['login'] for a in (i.get('assignees') or [])] if 'gemini' not in assignees: continue title = i['title'].lower() - # Skip philosophy, epics, showcases, features (not 10-min code work) if '[philosophy]' in title: continue if '[epic]' in title or 'epic:' in title: continue if '[showcase]' in title: continue - if '[feature]' in title: continue - # Check skip list num_str = str(i['number']) - entry = skips.get(num_str, {}) - if entry and entry.get('until', 0) > time.time(): - continue + if num_str in active_issues: continue - print(json.dumps({'number': i['number'], 'title': i['title']})) + entry = skips.get(num_str, {}) + if entry and entry.get('until', 0) > time.time(): continue + + lock = '${LOCK_DIR}/' + i['_repo'].replace('/', '-') + '-' + num_str + '.lock' + if os.path.isdir(lock): continue + + repo = i['_repo'] + owner, name = repo.split('/') + print(json.dumps({ + 'number': i['number'], + 'title': i['title'], + 'repo_owner': owner, + 'repo_name': name, + 'repo': repo, + })) sys.exit(0) + print('null') " 2>/dev/null } build_prompt() { - local issue_num="$1" - local issue_title="$2" - local worktree="$3" - + local issue_num="$1" issue_title="$2" worktree="$3" repo_owner="$4" repo_name="$5" cat < (#${issue_num})", "body": "Fixes #${issue_num}\n\n", "head": "gemini/issue-${issue_num}", "base": "main"}' 5. 
COMMENT on the issue when done: - curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}/comments" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ + curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments" \\ + -H "Authorization: token ${GITEA_TOKEN}" \\ + -H "Content-Type: application/json" \\ -d '{"body": "PR created. "}' -6. FILE NEW ISSUES if you find bugs, missing tests, or improvements while working: - curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{"title": "[gemini-generated] ", "body": "<description>"}' - == RULES == - Read CLAUDE.md or project README first for conventions -- tox is the ONLY way to run tests/lint/format. Never run pytest/ruff directly. +- If the project has tox, use tox. If npm, use npm. Follow the project. - Never use --no-verify on git commands. - If tests fail after 2 attempts, STOP and comment on the issue explaining why. -- Be thorough. If you see something broken nearby, file an issue for it. +- Be thorough but focused. Fix the issue, don't refactor the world. PROMPT } -# === MAIN LOOP === -log "=== Gemini Loop Started ===" -log "Repo: ${REPO_DIR}" -log "Worktrees: ${WORKTREE_BASE}" +# === WORKER FUNCTION === +run_worker() { + local worker_id="$1" + local consecutive_failures=0 -while true; do - # Check for too many consecutive failures - if [ "$failure_count" -ge "$MAX_FAILURES" ]; then - log "BACKOFF: ${failure_count} consecutive failures. Sleeping ${LONG_SLEEP}s..." 
- sleep "$LONG_SLEEP" - failure_count=0 - fi + log "WORKER-${worker_id}: Started" - # Fetch latest main (resilient — never die on git errors) - cd "$REPO_DIR" - timeout 60 git fetch origin main 2>/dev/null || { log "WARN: git fetch failed, continuing anyway"; } - git checkout main 2>/dev/null || true - git reset --hard origin/main 2>/dev/null || true - - # Get next issue - issue_json=$(get_next_issue) - - if [ "$issue_json" = "null" ] || [ -z "$issue_json" ]; then - # Only log idle ONCE, then go quiet until work appears - if [ "${LAST_STATE:-}" != "idle" ]; then - log "Queue empty. Waiting for assignments..." - LAST_STATE="idle" + while true; do + if [ "$consecutive_failures" -ge 5 ]; then + local backoff=$((RATE_LIMIT_SLEEP * (consecutive_failures / 5))) + [ "$backoff" -gt "$MAX_RATE_SLEEP" ] && backoff=$MAX_RATE_SLEEP + log "WORKER-${worker_id}: BACKOFF ${backoff}s (${consecutive_failures} failures)" + sleep "$backoff" + consecutive_failures=0 fi - sleep "$LONG_SLEEP" - continue - fi - LAST_STATE="working" - issue_num=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['number'])") - issue_title=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])") - branch="gemini/issue-${issue_num}" - worktree="${WORKTREE_BASE}/gemini-${issue_num}" + issue_json=$(get_next_issue) - log "=== ISSUE #${issue_num}: ${issue_title} ===" + if [ "$issue_json" = "null" ] || [ -z "$issue_json" ]; then + update_active "$worker_id" "" "" "idle" + sleep 60 + continue + fi - # Create worktree - if [ -d "$worktree" ]; then - log "Worktree already exists, cleaning..." 
- cleanup_worktree "$worktree" "$branch" - fi + issue_num=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['number'])") + issue_title=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])") + repo_owner=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_owner'])") + repo_name=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_name'])") + issue_key="${repo_owner}-${repo_name}-${issue_num}" + branch="gemini/issue-${issue_num}" + worktree="${WORKTREE_BASE}/gemini-w${worker_id}-${issue_num}" - cd "$REPO_DIR" - if ! git worktree add "$worktree" -b "$branch" origin/main 2>&1; then - log "ERROR: Failed to create worktree for #${issue_num}" - failure_count=$((failure_count + 1)) - sleep "$COOLDOWN" - continue - fi + if ! lock_issue "$issue_key"; then + sleep 5 + continue + fi - # Configure git remote with gemini's token so it can push - cd "$worktree" - git remote set-url origin "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${REPO_OWNER}/${REPO_NAME}.git" - cd "$REPO_DIR" + log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ===" + update_active "$worker_id" "$issue_num" "${repo_owner}/${repo_name}" "working" - # Build prompt - prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree") + # Ensure local clone + local_repo="${WORKTREE_BASE}/gemini-base-${repo_owner}-${repo_name}" + if [ ! -d "$local_repo" ]; then + log "WORKER-${worker_id}: Cloning ${repo_owner}/${repo_name}..." 
+ git clone --depth=1 "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git" "$local_repo" 2>&1 || { + log "WORKER-${worker_id}: ERROR cloning" + unlock_issue "$issue_key" + consecutive_failures=$((consecutive_failures + 1)) + sleep "$COOLDOWN" + continue + } + cd "$local_repo" + git fetch --unshallow origin main 2>/dev/null || true + fi - # Run Gemini Code CLI with timeout - log "Launching Gemini Code for #${issue_num} (timeout: ${GEMINI_TIMEOUT}s)..." + cd "$local_repo" + timeout 60 git fetch origin main 2>/dev/null || true + git checkout main 2>/dev/null || true + git reset --hard origin/main 2>/dev/null || true - set +e - cd "$worktree" - gtimeout "$GEMINI_TIMEOUT" gemini \ - --print \ - --quiet \ - -w "$worktree" \ - -p "$prompt" \ - </dev/null 2>&1 | tee "$LOG_DIR/gemini-${issue_num}.log" - exit_code=${PIPESTATUS[0]} - cd "$REPO_DIR" - set -e + [ -d "$worktree" ] && cleanup_worktree "$worktree" "$branch" + cd "$local_repo" - if [ "$exit_code" -eq 0 ]; then - log "SUCCESS: #${issue_num} completed — attempting auto-merge..." + if ! git worktree add "$worktree" -b "$branch" origin/main 2>&1; then + log "WORKER-${worker_id}: ERROR creating worktree" + unlock_issue "$issue_key" + consecutive_failures=$((consecutive_failures + 1)) + sleep "$COOLDOWN" + continue + fi - # Find and merge the PR gemini created - pr_num=$(curl -sf "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/pulls?state=open&head=${REPO_OWNER}:${branch}&limit=1" \ - -H "Authorization: token ${GITEA_TOKEN}" | python3 -c " + cd "$worktree" + git remote set-url origin "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git" + + prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name") + + log "WORKER-${worker_id}: Launching Gemini Code for #${issue_num}..." 
+ + set +e + cd "$worktree" + gtimeout "$GEMINI_TIMEOUT" gemini \ + -p "$prompt" \ + --yolo \ + </dev/null >> "$LOG_DIR/gemini-${issue_num}.log" 2>&1 + exit_code=$? + set -e + + if [ "$exit_code" -eq 0 ]; then + log "WORKER-${worker_id}: SUCCESS #${issue_num}" + + pr_num=$(curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls?state=open&head=${repo_owner}:${branch}&limit=1" \ + -H "Authorization: token ${GITEA_TOKEN}" | python3 -c " import sys,json prs = json.load(sys.stdin) if prs: print(prs[0]['number']) else: print('') " 2>/dev/null) - if [ -n "$pr_num" ]; then - merge_result=$(curl -sf -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/pulls/${pr_num}/merge" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{"Do": "squash"}' 2>&1) || true - log " PR #${pr_num} merge attempted" + if [ -n "$pr_num" ]; then + curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}/merge" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"Do": "squash"}' >/dev/null 2>&1 || true + curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}" \ + -H "Authorization: token ${GITEA_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{"state": "closed"}' >/dev/null 2>&1 || true + log "WORKER-${worker_id}: PR #${pr_num} merged, issue #${issue_num} closed" + fi + + consecutive_failures=0 + + elif [ "$exit_code" -eq 124 ]; then + log "WORKER-${worker_id}: TIMEOUT #${issue_num}" + mark_skip "$issue_num" "timeout" 1 + consecutive_failures=$((consecutive_failures + 1)) - # Close the issue (Gitea auto-close via "Fixes #N" is unreliable) - curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}" \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Content-Type: application/json" \ - -d '{"state": "closed"}' >/dev/null 2>&1 || true - log " Issue #${issue_num} closed" else - log " WARN: 
No open PR found for branch ${branch}" + if grep -qE "rate_limit|rate limit|429|overloaded|quota" "$LOG_DIR/gemini-${issue_num}.log" 2>/dev/null; then + log "WORKER-${worker_id}: RATE LIMITED on #${issue_num}" + mark_skip "$issue_num" "rate_limit" 0.25 + consecutive_failures=$((consecutive_failures + 3)) + else + log "WORKER-${worker_id}: FAILED #${issue_num} (exit ${exit_code})" + mark_skip "$issue_num" "exit_code_${exit_code}" 1 + consecutive_failures=$((consecutive_failures + 1)) + fi + fi - failure_count=0 - issues_completed=$((issues_completed + 1)) - log "Stats: ${issues_completed} issues completed this session" - elif [ "$exit_code" -eq 124 ]; then - log "TIMEOUT: #${issue_num} exceeded ${GEMINI_TIMEOUT}s" - mark_skip "$issue_num" "timeout" 1 - failure_count=$((failure_count + 1)) - else - log "FAILED: #${issue_num} exited with code ${exit_code}" - mark_skip "$issue_num" "exit_code_${exit_code}" 1 - failure_count=$((failure_count + 1)) - fi + cleanup_worktree "$worktree" "$branch" + unlock_issue "$issue_key" + update_active "$worker_id" "" "" "done" - # Clean up worktree - cleanup_worktree "$worktree" "$branch" + sleep "$COOLDOWN" + done +} - # Cooldown - log "Cooling down ${COOLDOWN}s before next issue..." - sleep "$COOLDOWN" +# === MAIN === +log "=== Gemini Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ===" +log "Worktrees: ${WORKTREE_BASE}" + +rm -rf "$LOCK_DIR"/*.lock 2>/dev/null + +# PID tracking via files (bash 3.2 compatible) +PID_DIR="$LOG_DIR/gemini-pids" +mkdir -p "$PID_DIR" +rm -f "$PID_DIR"/*.pid 2>/dev/null + +launch_worker() { + local wid="$1" + run_worker "$wid" & + echo $!
> "$PID_DIR/${wid}.pid" + log "Launched worker $wid (PID $!)" +} + +for i in $(seq 1 "$NUM_WORKERS"); do + launch_worker "$i" + sleep 3 +done + +# Dynamic scaler — every 3 minutes +CURRENT_WORKERS="$NUM_WORKERS" +while true; do + sleep 180 + + # Reap dead workers + for pidfile in "$PID_DIR"/*.pid; do + [ -f "$pidfile" ] || continue + wid=$(basename "$pidfile" .pid) + wpid=$(cat "$pidfile") + if ! kill -0 "$wpid" 2>/dev/null; then + log "SCALER: Worker $wid died — relaunching" + launch_worker "$wid" + sleep 2 + fi + done + + recent_rate_limits=$(tail -100 "$LOG_DIR/gemini-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true) + recent_successes=$(tail -100 "$LOG_DIR/gemini-loop.log" 2>/dev/null | grep -c "SUCCESS" || true) + + if [ "$recent_rate_limits" -gt 0 ]; then + if [ "$CURRENT_WORKERS" -gt 2 ]; then + drop_to=$(( CURRENT_WORKERS / 2 )) + [ "$drop_to" -lt 2 ] && drop_to=2 + log "SCALER: Rate limited — scaling ${CURRENT_WORKERS} → ${drop_to}" + for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do + if [ -f "$PID_DIR/${wid}.pid" ]; then + kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true + rm -f "$PID_DIR/${wid}.pid" + update_active "$wid" "" "" "done" + fi + done + CURRENT_WORKERS=$drop_to + fi + elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then + new_count=$(( CURRENT_WORKERS + 2 )) + [ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS + log "SCALER: Healthy — scaling ${CURRENT_WORKERS} → ${new_count}" + for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do + launch_worker "$wid" + sleep 2 + done + CURRENT_WORKERS=$new_count + fi done diff --git a/bin/loop-watchdog.sh b/bin/loop-watchdog.sh new file mode 100644 index 0000000..c56274e --- /dev/null +++ b/bin/loop-watchdog.sh @@ -0,0 +1,251 @@ +#!/usr/bin/env bash +# loop-watchdog.sh — Self-healing monitor for all agent loops +# Runs every 2 minutes. Restarts dead loops, kills zombies, +# and files Gitea issues for problems it can't auto-fix. 
+# +# Usage: Run via cron or: nohup bash ~/.hermes/bin/loop-watchdog.sh & + +set -uo pipefail + +LOG_DIR="$HOME/.hermes/logs" +LOG="$LOG_DIR/watchdog.log" +ISSUE_LOG="$LOG_DIR/watchdog-issues.json" # tracks filed issues to avoid duplicates +GITEA_URL="http://143.198.27.163:3000" +ADMIN_TOKEN=$(cat "$HOME/.config/gitea/token" 2>/dev/null) +ISSUE_REPO="rockachopa/hermes-agent" # ops issues go here +CHECK_INTERVAL=120 # 2 minutes + +mkdir -p "$LOG_DIR" +[ -f "$ISSUE_LOG" ] || echo '{}' > "$ISSUE_LOG" + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] WATCHDOG: $*" >> "$LOG" +} + +# File a Gitea issue for problems that can't be auto-fixed. +# Deduplicates: won't file the same issue_key within 6 hours. +file_issue() { + local issue_key="$1" + local title="$2" + local body="$3" + local assignee="${4:-claude}" + + # Check if we already filed this recently + local should_file + should_file=$(python3 -c " +import json, time +try: + with open('$ISSUE_LOG') as f: filed = json.load(f) +except: filed = {} +entry = filed.get('$issue_key', {}) +if entry and entry.get('until', 0) > time.time(): + print('no') +else: + filed['$issue_key'] = {'until': time.time() + 21600, 'title': '''$title'''} + with open('$ISSUE_LOG', 'w') as f: json.dump(filed, f, indent=2) + print('yes') +" 2>/dev/null) + + if [ "$should_file" != "yes" ]; then + return 0 + fi + + log "FILING ISSUE: $title" + curl -sf -X POST "${GITEA_URL}/api/v1/repos/${ISSUE_REPO}/issues" \ + -H "Authorization: token ${ADMIN_TOKEN}" \ + -H "Content-Type: application/json" \ + -d "$(python3 -c " +import json +print(json.dumps({ + 'title': '[watchdog] $title', + 'body': '''$body + +--- +*Auto-filed by loop-watchdog at $(date '+%Y-%m-%d %H:%M:%S')*''', + 'assignees': ['$assignee'], +}))" 2>/dev/null)" >/dev/null 2>&1 || log "WARN: Failed to file issue: $title" +} + +# === HEALTH CHECKS === + +check_loop() { + local name="$1" # kimi | claude | gemini + local grep_pat="$2" # pattern to find the loop process + local wake_cmd="$3" # 
command to restart + local log_file="$4" # log to check for errors + local worker_pat="${5:-}" # optional: pattern for worker processes + + local pid + pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1) + + if [ -z "$pid" ]; then + log "$name loop DOWN — restarting..." + eval "$wake_cmd" + sleep 3 + + # Verify it came back + pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1) + if [ -z "$pid" ]; then + file_issue \ + "${name}-loop-dead" \ + "${name} loop won't start" \ + "The ${name} agent loop failed to start after automatic restart attempt.\n\nRestart command: \`${wake_cmd}\`\n\nLast 20 lines of log:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`\n\nPlease investigate and fix the startup issue." \ + "claude" + else + log "$name loop restarted (PID $pid)" + fi + return + fi + + # Loop is running — check for stalls + if [ -f "$log_file" ]; then + local last_activity + last_activity=$(stat -f %m "$log_file" 2>/dev/null || stat -c %Y "$log_file" 2>/dev/null || echo 0) + local now + now=$(date +%s) + local stale_seconds=$(( now - last_activity )) + + # If no log activity for 30 minutes, something is wrong + if [ "$stale_seconds" -gt 1800 ]; then + log "$name loop STALE — no activity for ${stale_seconds}s" + + # Check if it's just idle (empty queue) vs truly stuck + local last_line + last_line=$(tail -1 "$log_file" 2>/dev/null) + if echo "$last_line" | grep -q "Queue empty\|Waiting for assignments\|idle"; then + # Just idle, that's fine + return + fi + + # Kill and restart + log "$name loop stuck — killing and restarting..." 
+ pkill -f "$grep_pat" 2>/dev/null + [ -n "$worker_pat" ] && pkill -f "$worker_pat" 2>/dev/null + sleep 2 + eval "$wake_cmd" + sleep 3 + + pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1) + if [ -z "$pid" ]; then + file_issue \ + "${name}-loop-stuck" \ + "${name} loop stuck and won't restart" \ + "The ${name} loop was stale (${stale_seconds}s no activity) and failed to restart.\n\nLast 20 lines:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`" \ + "claude" + else + log "$name loop recovered (PID $pid)" + fi + fi + + # Check for crash loops (5+ failures in last 50 lines) + local recent_failures + recent_failures=$(tail -50 "$log_file" 2>/dev/null | grep -cE "FAILED:|ERROR:" || true) + if [ "$recent_failures" -ge 5 ]; then + local error_sample + error_sample=$(tail -50 "$log_file" 2>/dev/null | grep -E "FAILED:|ERROR:" | tail -5) + file_issue \ + "${name}-crash-loop" \ + "${name} agent in crash loop (${recent_failures} recent failures)" \ + "The ${name} agent loop has ${recent_failures} failures in its last 50 log lines, suggesting a systemic problem.\n\nRecent errors:\n\`\`\`\n${error_sample}\n\`\`\`\n\nPlease investigate the root cause — could be API issues, bad repo state, or CLI problems." \ + "claude" + fi + fi +} + +check_gitea() { + if ! curl -sf --max-time 5 "${GITEA_URL}/api/v1/version" >/dev/null 2>&1; then + log "Gitea UNREACHABLE" + file_issue \ + "gitea-down" \ + "Gitea instance unreachable" \ + "The Gitea instance at ${GITEA_URL} is not responding. All agent loops depend on it.\n\nThis needs immediate attention — check the VPS at 143.198.27.163."
\ + "claude" + fi +} + +check_zombies() { + local stuck_git + stuck_git=$(ps aux | grep -E "git.*push|git-remote-http" | grep -v grep | wc -l | tr -d ' ') + local orphan_py + orphan_py=$(ps aux | grep "pytest tests/" | grep -v grep | wc -l | tr -d ' ') + + if [ "$stuck_git" -gt 3 ]; then + log "Killing $stuck_git stuck git processes" + pkill -f "git.*push|git-remote-http" 2>/dev/null || true + fi + + if [ "$orphan_py" -gt 3 ]; then + log "Killing $orphan_py orphaned pytest processes" + pkill -f "pytest tests/" 2>/dev/null || true + fi +} + +check_disk() { + local worktree_count + worktree_count=$(find "$HOME/worktrees" -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ') + + if [ "$worktree_count" -gt 30 ]; then + log "WARN: $worktree_count worktrees — possible leak" + file_issue \ + "worktree-leak" \ + "Worktree accumulation: ${worktree_count} dirs in ~/worktrees" \ + "There are ${worktree_count} worktree directories, suggesting cleanup is failing.\n\nList:\n\`\`\`\n$(ls -1 "$HOME/worktrees/" 2>/dev/null | head -30)\n\`\`\`\n\nPlease investigate which loops are leaking worktrees and fix the cleanup."
\ + "claude" + fi +} + +check_skip_lists() { + # If all agents have full skip lists, the whole system is stuck + for agent in claude gemini kimi; do + local skip_file="$LOG_DIR/${agent}-skip-list.json" + [ -f "$skip_file" ] || continue + local skip_count + skip_count=$(python3 -c " +import json, time +try: + with open('$skip_file') as f: skips = json.load(f) + active = sum(1 for v in skips.values() if v.get('until',0) > time.time()) + print(active) +except: print(0) +" 2>/dev/null) + + if [ "${skip_count:-0}" -gt 10 ]; then + file_issue \ + "${agent}-skip-overload" \ + "${agent} has ${skip_count} skipped issues — systemic failure" \ + "The ${agent} agent has ${skip_count} issues in its skip list, meaning most of its queue is blocked.\n\nSkip list contents:\n\`\`\`\n$(cat "$skip_file" 2>/dev/null | python3 -m json.tool 2>/dev/null | head -40)\n\`\`\`\n\nThis likely indicates a systemic problem (broken tests, bad config, API changes)." \ + "claude" + fi + done +} + +# === MAIN === +log "=== Watchdog Started ===" + +while true; do + # Gitea must be up for anything to work + check_gitea + + # Check each agent loop + check_loop "kimi" "kimi-loop.sh" \ + "nohup bash ~/.hermes/bin/kimi-loop.sh >> ~/.hermes/logs/kimi-loop.log 2>&1 &" \ + "$LOG_DIR/kimi-loop.log" \ + "kimi.*--print" + + check_loop "claude" "claude-loop.sh" \ + "nohup bash ~/.hermes/bin/claude-loop.sh 3 >> ~/.hermes/logs/claude-loop.log 2>&1 &" \ + "$LOG_DIR/claude-loop.log" \ + "claude.*--print.*--dangerously" + + check_loop "gemini" "gemini-loop.sh" \ + "nohup bash ~/.hermes/bin/gemini-loop.sh >> ~/.hermes/logs/gemini-loop.log 2>&1 &" \ + "$LOG_DIR/gemini-loop.log" \ + "gemini.*-p" + + # Housekeeping + check_zombies + check_disk + check_skip_lists + + sleep "$CHECK_INTERVAL" +done diff --git a/bin/ops-helpers.sh b/bin/ops-helpers.sh index 4fde358..c608861 100755 --- a/bin/ops-helpers.sh +++ b/bin/ops-helpers.sh @@ -35,6 +35,10 @@ ops-help() { echo " ops-kill-gemini Stop Gemini loop" echo " ops-kill-zombies 
Kill stuck git/pytest" echo "" + echo -e " \033[1mWatchdog\033[0m" + echo " ops-wake-watchdog Start loop watchdog" + echo " ops-kill-watchdog Stop loop watchdog" + echo "" echo -e " \033[2m Type ops-help to see this again\033[0m" echo "" } @@ -196,3 +200,15 @@ ops-kill-zombies() { done echo " Killed $killed zombie processes" } + +ops-wake-watchdog() { + pkill -f "loop-watchdog.sh" 2>/dev/null + sleep 1 + nohup bash ~/.hermes/bin/loop-watchdog.sh >> ~/.hermes/logs/watchdog.log 2>&1 & + echo " Watchdog started (PID $!)" +} + +ops-kill-watchdog() { + pkill -f "loop-watchdog.sh" 2>/dev/null + echo " Watchdog stopped" +}