# ============================================================================
# This listing holds TWO separate new files (originally carried as a git diff):
#   1. bin/claude-loop.sh          (new file, mode 100755)
#   2. bin/claudemax-watchdog.sh   (new file, mode 100755)
# They are independent scripts; split at the banner before installing.
# ============================================================================

# ===== file: bin/claude-loop.sh =============================================
#!/usr/bin/env bash
# claude-loop.sh — Parallel Claude Code agent dispatch loop
# Runs N workers concurrently against the Gitea backlog.
# Gracefully handles rate limits with backoff.
#
# Usage: claude-loop.sh [NUM_WORKERS]   (default: 2, capped at MAX_WORKERS)
#
# Files used (all under $HOME/.hermes):
#   claude_token                  Gitea API token (read once at startup)
#   logs/claude-loop.log          loop log (also parsed by the scaler)
#   logs/claude-<issue>.log       per-issue Claude Code output
#   logs/claude-skip-list.json    issues to skip until a timestamp
#   logs/claude-active.json       worker -> issue map
#   logs/claude-locks/*.lock      per-issue lock directories
#   logs/claude-metrics.jsonl     one JSON object per finished cycle
#
# Written to stay compatible with bash 3.2 (no arrays-under-set-u tricks,
# no mapfile, no wait -n).

set -euo pipefail

# === CONFIG ===
NUM_WORKERS="${1:-2}"
MAX_WORKERS=10          # absolute ceiling
WORKTREE_BASE="$HOME/worktrees"
GITEA_URL="http://143.198.27.163:3000"
TOKEN_FILE="$HOME/.hermes/claude_token"
CLAUDE_TIMEOUT=900      # 15 min per issue
COOLDOWN=15             # seconds between issues — stagger clones
RATE_LIMIT_SLEEP=30     # initial sleep on rate limit
MAX_RATE_SLEEP=120      # max backoff on rate limit
LOG_DIR="$HOME/.hermes/logs"
SKIP_FILE="$LOG_DIR/claude-skip-list.json"
LOCK_DIR="$LOG_DIR/claude-locks"
ACTIVE_FILE="$LOG_DIR/claude-active.json"
METRICS_FILE="$LOG_DIR/claude-metrics.jsonl"
PR_SWEEP_REPO="Timmy_Foundation/the-nexus"   # repo whose open PRs are merged first

# Validate the worker count before anything is launched.
case "$NUM_WORKERS" in
  '' | *[!0-9]*)
    echo "Usage: ${0##*/} [NUM_WORKERS]  (positive integer, default 2)" >&2
    exit 2
    ;;
esac
if [ "$NUM_WORKERS" -lt 1 ]; then
  echo "${0##*/}: NUM_WORKERS must be at least 1" >&2
  exit 2
fi
if [ "$NUM_WORKERS" -gt "$MAX_WORKERS" ]; then
  NUM_WORKERS=$MAX_WORKERS
fi

if [ ! -r "$TOKEN_FILE" ]; then
  echo "${0##*/}: cannot read Gitea token at $TOKEN_FILE" >&2
  exit 1
fi
GITEA_TOKEN=$(tr -d '[:space:]' < "$TOKEN_FILE")

# Authenticated git base URL derived from GITEA_URL so host/port live in one place.
# SECURITY NOTE(review): the token is embedded in clone URLs (visible in `ps`
# and in each clone's .git/config) and in the agent prompt. Consider a git
# credential helper instead.
GITEA_SCHEME="${GITEA_URL%%://*}"
GITEA_HOST="${GITEA_URL#*://}"
GIT_BASE_URL="${GITEA_SCHEME}://claude:${GITEA_TOKEN}@${GITEA_HOST}"

# GNU timeout is `gtimeout` under Homebrew coreutils and `timeout` elsewhere.
if command -v gtimeout >/dev/null 2>&1; then
  TIMEOUT_BIN=gtimeout
elif command -v timeout >/dev/null 2>&1; then
  TIMEOUT_BIN=timeout
else
  echo "${0##*/}: neither gtimeout nor timeout found in PATH" >&2
  exit 1
fi

mkdir -p "$LOG_DIR" "$WORKTREE_BASE" "$LOCK_DIR"

# Initialize files: keep an existing skip list, always reset the active map.
[ -f "$SKIP_FILE" ] || echo '{}' > "$SKIP_FILE"
echo '{}' > "$ACTIVE_FILE"

# Issue key currently locked by this worker subshell ("" when none).
CURRENT_LOCK=""
# Set to 1 by prepare_worktree when the remote branch already existed, in
# which case local history may have been rewritten and the push must be forced.
WT_FORCE_PUSH=""

# === SHARED FUNCTIONS ===

# Append a timestamped line to the loop log. The scaler greps this file for
# the literal strings "SUCCESS" and "RATE LIMITED".
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_DIR/claude-loop.log"
}

# Try to take the per-issue lock (atomic mkdir).
# Arguments: $1 - issue key "<owner>-<repo>-<number>"
# Returns:   0 if the lock was acquired, 1 if it is already held.
lock_issue() {
  local issue_key="$1"
  local lockfile="$LOCK_DIR/$issue_key.lock"
  if mkdir "$lockfile" 2>/dev/null; then
    # $$ is the parent shell's PID inside a worker subshell; BASHPID (bash 4+)
    # is the worker's own PID when available.
    echo "${BASHPID:-$$}" > "$lockfile/pid"
    return 0
  fi
  return 1
}

# Release the per-issue lock. Never fails.
unlock_issue() {
  local issue_key="$1"
  [ -n "$issue_key" ] || return 0
  rm -rf -- "$LOCK_DIR/$issue_key.lock" 2>/dev/null || true
}

# Record an issue in the skip list.
# Arguments: $1 - issue number, $2 - reason, $3 - hours to skip (default 1)
# After 3 recorded failures the skip window becomes 6 hours.
mark_skip() {
  local issue_num="$1"
  local reason="$2"
  local skip_hours="${3:-1}"
  # Values are passed as argv, never spliced into the Python source.
  python3 - "$SKIP_FILE" "$issue_num" "$reason" "$skip_hours" <<'PY' 2>/dev/null || log "WARN: could not update skip list for #${issue_num}"
import fcntl, json, sys, time

path, issue, reason, hours = sys.argv[1], sys.argv[2], sys.argv[3], float(sys.argv[4])
with open(path, 'r+') as f:
    fcntl.flock(f, fcntl.LOCK_EX)
    try:
        skips = json.load(f)
    except ValueError:
        skips = {}
    failures = skips.get(issue, {}).get('failures', 0) + 1
    until = time.time() + hours * 3600
    if failures >= 3:
        until = time.time() + 6 * 3600
    skips[issue] = {'until': until, 'reason': reason, 'failures': failures}
    f.seek(0)
    f.truncate()
    json.dump(skips, f, indent=2)
PY
  log "SKIP: #${issue_num} — ${reason}"
}

# Update this worker's entry in the active map.
# Arguments: $1 - worker id, $2 - issue number, $3 - "owner/repo", $4 - status
# A status of "done" removes the worker's entry.
update_active() {
  local worker="$1" issue="$2" repo="$3" status="$4"
  python3 - "$ACTIVE_FILE" "$worker" "$issue" "$repo" "$status" <<'PY' 2>/dev/null || log "WARN: could not update active map for worker ${worker}"
import fcntl, json, sys

path, worker, issue, repo, status = sys.argv[1:6]
with open(path, 'r+') as f:
    fcntl.flock(f, fcntl.LOCK_EX)
    try:
        active = json.load(f)
    except ValueError:
        active = {}
    if status == 'done':
        active.pop(worker, None)
    else:
        active[worker] = {'issue': issue, 'repo': repo, 'status': status}
    f.seek(0)
    f.truncate()
    json.dump(active, f, indent=2)
PY
}

# Remove a worktree directory. Never fails.
cleanup_workdir() {
  local wt="$1"
  [ -n "$wt" ] || return 0
  rm -rf -- "$wt" 2>/dev/null || true
}

# Print one field of a JSON object.
# Arguments: $1 - JSON text, $2 - key
json_field() {
  printf '%s' "$1" \
    | python3 -c 'import json, sys; print(json.load(sys.stdin)[sys.argv[1]])' "$2"
}

# Pick the next issue to work on.
# Outputs: a JSON object {number,title,repo_owner,repo_name,repo} on stdout,
#          or the literal string "null" when nothing is eligible.
# Side effect: self-assigns the chosen issue to "claude" if it was unassigned.
get_next_issue() {
  GITEA_TOKEN="$GITEA_TOKEN" GITEA_URL="$GITEA_URL" SKIP_FILE="$SKIP_FILE" \
    ACTIVE_FILE="$ACTIVE_FILE" LOCK_DIR="$LOCK_DIR" \
    python3 - <<'PY' 2>/dev/null
import json, os, sys, time, urllib.request

token = os.environ['GITEA_TOKEN']
base = os.environ['GITEA_URL']
lock_dir = os.environ['LOCK_DIR']
auth = {'Authorization': f'token {token}'}

REPOS = [
    'Timmy_Foundation/the-nexus',
    'Timmy_Foundation/autolora',
]

# Title fragments (lower-case) that mark an issue as not for the agent.
SKIP_MARKERS = (
    '[philosophy]', '[epic]', 'epic:', '[showcase]', '[do not close',
    '[meta]', '[governing]', '[permanent]', '[morning report]',
    '[retro]', '[intel]', 'master escalation',
)


def load_json(path):
    try:
        with open(path) as f:
            return json.load(f)
    except Exception:
        return {}


skips = load_json(os.environ['SKIP_FILE'])

# Issues other workers already hold (to avoid double-picking).
try:
    active_issues = {v['issue'] for v in load_json(os.environ['ACTIVE_FILE']).values()}
except Exception:
    active_issues = set()

all_issues = []
for repo in REPOS:
    url = f'{base}/api/v1/repos/{repo}/issues?state=open&type=issues&limit=50&sort=created'
    try:
        resp = urllib.request.urlopen(urllib.request.Request(url, headers=auth), timeout=10)
        issues = json.loads(resp.read())
    except Exception:
        continue  # one unreachable repo must not hide the other
    for issue in issues:
        issue['_repo'] = repo
    all_issues.extend(issues)


# Sort by priority: URGENT > P0 > P1 > bugs > LHF > P2 > rest
def priority(issue):
    t = issue['title'].lower()
    if '[urgent]' in t or 'urgent:' in t:
        return 0
    if '[p0]' in t:
        return 1
    if '[p1]' in t:
        return 2
    if '[bug]' in t:
        return 3
    if 'lhf:' in t or 'lhf ' in t:
        return 4
    if '[p2]' in t:
        return 5
    return 6


all_issues.sort(key=priority)  # stable: API order is kept within a priority

for issue in all_issues:
    assignees = [a['login'] for a in (issue.get('assignees') or [])]
    # Take issues assigned to claude OR unassigned (self-assign)
    if assignees and 'claude' not in assignees:
        continue

    title = issue['title'].lower()
    if any(marker in title for marker in SKIP_MARKERS):
        continue
    if 'Rockachopa' in assignees:
        continue

    num_str = str(issue['number'])
    if num_str in active_issues:
        continue

    entry = skips.get(num_str, {})
    if entry and entry.get('until', 0) > time.time():
        continue

    repo = issue['_repo']
    lock = os.path.join(lock_dir, repo.replace('/', '-') + '-' + num_str + '.lock')
    if os.path.isdir(lock):
        continue

    owner, name = repo.split('/')

    # Self-assign if unassigned (best effort)
    if not assignees:
        try:
            req = urllib.request.Request(
                f"{base}/api/v1/repos/{repo}/issues/{issue['number']}",
                data=json.dumps({'assignees': ['claude']}).encode(),
                method='PATCH',
                headers=dict(auth, **{'Content-Type': 'application/json'}))
            urllib.request.urlopen(req, timeout=5)
        except Exception:
            pass

    print(json.dumps({
        'number': issue['number'],
        'title': issue['title'],
        'repo_owner': owner,
        'repo_name': name,
        'repo': repo,
    }))
    sys.exit(0)

print('null')
PY
}

# Print the agent prompt for one issue.
# Arguments: $1 issue number, $2 issue title, $3 worktree path,
#            $4 repo owner, $5 repo name
build_prompt() {
  local issue_num="$1"
  local issue_title="$2"
  local worktree="$3"
  local repo_owner="$4"
  local repo_name="$5"

  # NOTE(review): the head of this template (everything from the heredoc
  # marker up to the tail of the "create PR" curl example, i.e. the intro and
  # steps 1-4) was lost when the patch was extracted. The surviving text is
  # reproduced verbatim below; restore the missing head from the original
  # commit before deploying. issue_title and worktree are presumably used in
  # the lost part — verify.
  cat <<PROMPT
 (#${issue_num})", "body": "Fixes #${issue_num}\n\n", "head": "claude/issue-${issue_num}", "base": "main"}'

5. COMMENT on the issue when done:
   curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments" \\
     -H "Authorization: token ${GITEA_TOKEN}" \\
     -H "Content-Type: application/json" \\
     -d '{"body": "PR created. "}'

== RULES ==
- Read CLAUDE.md or project README first for conventions
- If the project has tox, use tox. If npm, use npm. Follow the project.
- Never use --no-verify on git commands.
- If tests fail after 2 attempts, STOP and comment on the issue explaining why.
- Be thorough but focused. Fix the issue, don't refactor the world.

== CRITICAL: ALWAYS COMMIT AND PUSH ==
- NEVER exit without committing your work. Even partial progress MUST be committed.
- Before you finish, ALWAYS: git add -A && git commit && git push origin claude/issue-${issue_num}
- ALWAYS create a PR before exiting. No exceptions.
- If a branch already exists with prior work, check it out and CONTINUE from where it left off.
- Check: git ls-remote origin claude/issue-${issue_num} — if it exists, pull it first.
- Your work is WASTED if it's not pushed. Push early, push often.
PROMPT
}

# === GITEA / GIT HELPERS ===

# Ask Gitea to squash-merge a PR.
# Arguments: $1 "owner/repo", $2 PR number,
#            $3 "true" (default) to delete the branch after merge
# Returns:   0 if the API accepted the merge, non-zero otherwise.
merge_pr() {
  local repo="$1" pr="$2" delete_branch="${3:-true}"
  local payload='{"Do":"squash"}'
  if [ "$delete_branch" = "true" ]; then
    payload='{"Do":"squash","delete_branch_after_merge":true}'
  fi
  curl -sf -X POST \
    -H "Authorization: token ${GITEA_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "${GITEA_URL}/api/v1/repos/${repo}/pulls/${pr}/merge" >/dev/null 2>&1
}

# Print the number of the open PR whose head branch is $2, or nothing.
# Filters client-side so the result does not depend on the server honouring
# a head= query parameter.
find_open_pr() {
  local repo="$1" head_branch="$2"
  curl -sf -H "Authorization: token ${GITEA_TOKEN}" \
    "${GITEA_URL}/api/v1/repos/${repo}/pulls?state=open&limit=50" 2>/dev/null \
    | python3 -c '
import json, sys
for pr in json.load(sys.stdin):
    if (pr.get("head") or {}).get("ref") == sys.argv[1]:
        print(pr["number"])
        break
' "$head_branch" 2>/dev/null
}

# Open a PR from $3 into main and print its number.
# Arguments: $1 "owner/repo", $2 issue number, $3 head branch, $4 agent exit code
create_pr() {
  local repo="$1" issue_num="$2" head_branch="$3" exit_code="$4"
  local payload
  payload=$(python3 -c '
import json, sys
issue, head, code = sys.argv[1:4]
print(json.dumps({
    "title": "Claude: Issue #" + issue,
    "head": head,
    "base": "main",
    "body": "Automated PR for issue #" + issue + ".\nExit code: " + code,
}))' "$issue_num" "$head_branch" "$exit_code") || return 1
  curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo}/pulls" \
    -H "Authorization: token ${GITEA_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" 2>/dev/null \
    | python3 -c 'import json, sys; print(json.load(sys.stdin).get("number", ""))' 2>/dev/null
}

# Rebase one non-mergeable PR branch onto main, force-push it and merge it.
# If the rebase itself conflicts, the PR is closed (original policy).
# Network failures leave the PR untouched.
# Arguments: $1 worker id, $2 "owner/repo", $3 PR number, $4 PR head branch
rebase_and_merge_pr() {
  local worker_id="$1" repo="$2" pr_num="$3" pr_branch="$4"
  local clone_url="${GIT_BASE_URL}/${repo}.git"
  local tmpdir
  # Unique dir: several workers may sweep the same PR at the same time.
  tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/claude-rebase-${pr_num}.XXXXXX") || return 0

  # Subshell keeps the cd from leaking into the worker.
  (
    git clone -q --depth=50 -b "$pr_branch" "$clone_url" "$tmpdir" 2>/dev/null || exit 0
    cd "$tmpdir" || exit 0
    # A --depth clone is single-branch, so "git fetch origin main" would not
    # create origin/main; the explicit refspec does.
    if ! git fetch -q origin "+refs/heads/main:refs/remotes/origin/main" 2>/dev/null; then
      log "WORKER-${worker_id}: could not fetch main for PR #${pr_num} — leaving it open"
      exit 0
    fi
    if git rebase origin/main >/dev/null 2>&1; then
      if git push -q -f origin "$pr_branch" 2>/dev/null; then
        sleep 3
        if merge_pr "$repo" "$pr_num"; then
          log "WORKER-${worker_id}: rebased+merged PR #${pr_num}"
        else
          log "WORKER-${worker_id}: rebased PR #${pr_num}, merge not accepted yet"
        fi
      else
        log "WORKER-${worker_id}: push after rebase failed for PR #${pr_num}"
      fi
    else
      git rebase --abort >/dev/null 2>&1 || true
      curl -sf -X PATCH -H "Authorization: token ${GITEA_TOKEN}" \
        -H "Content-Type: application/json" -d '{"state":"closed"}' \
        "${GITEA_URL}/api/v1/repos/${repo}/pulls/${pr_num}" >/dev/null 2>&1 || true
      log "WORKER-${worker_id}: closed unrebaseable PR #${pr_num}"
    fi
  ) || true

  rm -rf -- "$tmpdir"
}

# RULE: merge existing PRs BEFORE creating new work.
# Looks at up to 5 open PRs in PR_SWEEP_REPO, takes at most 3 authored by
# "claude", merges the mergeable ones and rebases the rest.
# Arguments: $1 worker id
sweep_own_prs() {
  local worker_id="$1"
  local our_prs pr_num pr_branch mergeable

  our_prs=$(curl -sf -H "Authorization: token ${GITEA_TOKEN}" \
    "${GITEA_URL}/api/v1/repos/${PR_SWEEP_REPO}/pulls?state=open&limit=5" 2>/dev/null \
    | python3 -c '
import json, sys
prs = json.load(sys.stdin)
for p in [p for p in prs if p["user"]["login"] == "claude"][:3]:
    print("{}|{}|{}".format(p["number"], p["head"]["ref"], p.get("mergeable", False)))
' 2>/dev/null) || our_prs=""   # API hiccup: skip the sweep, do not kill the worker
  [ -n "$our_prs" ] || return 0

  # Read from fd 3 so commands in the loop body cannot eat the list.
  while IFS='|' read -r pr_num pr_branch mergeable <&3; do
    [ -n "$pr_num" ] || continue
    if [ "$mergeable" = "True" ]; then
      if merge_pr "$PR_SWEEP_REPO" "$pr_num"; then
        log "WORKER-${worker_id}: merged own PR #${pr_num}"
      else
        log "WORKER-${worker_id}: merge of own PR #${pr_num} not accepted"
      fi
      sleep 3
    else
      rebase_and_merge_pr "$worker_id" "$PR_SWEEP_REPO" "$pr_num" "$pr_branch"
    fi
  done 3<<EOF
$our_prs
EOF
}

# Create the worktree for an issue and cd into it, on branch $3.
# Continues an existing remote branch (rebased onto main) when possible,
# otherwise starts fresh from main.
# Arguments: $1 worker id, $2 issue number, $3 branch, $4 clone URL, $5 worktree
# Globals:   WT_FORCE_PUSH (written)
# Returns:   0 with cwd = worktree, 1 on failure (already logged).
prepare_worktree() {
  local worker_id="$1" issue_num="$2" branch="$3" clone_url="$4" worktree="$5"
  local remote_ref

  WT_FORCE_PUSH=""
  rm -rf -- "$worktree" 2>/dev/null || true

  # Captured rather than piped into grep -q, which can trip pipefail.
  remote_ref=$(git ls-remote --heads "$clone_url" "$branch" 2>/dev/null) || remote_ref=""

  if [ -n "$remote_ref" ]; then
    # Local history may diverge from the remote branch (rebase or restart).
    WT_FORCE_PUSH=1
    log "WORKER-${worker_id}: Found existing branch $branch — continuing prior work"
    if ! git clone --depth=50 -b "$branch" "$clone_url" "$worktree" >/dev/null 2>&1; then
      log "WORKER-${worker_id}: ERROR cloning branch $branch for #${issue_num}"
      return 1
    fi
    cd "$worktree" || return 1
    # Rebase on main to resolve stale conflicts from closed PRs.
    if git fetch origin "+refs/heads/main:refs/remotes/origin/main" >/dev/null 2>&1 \
      && git rebase origin/main >/dev/null 2>&1; then
      return 0
    fi
    git rebase --abort >/dev/null 2>&1 || true
    log "WORKER-${worker_id}: Rebase failed for $branch, starting fresh"
    cd "$HOME" || return 1
    rm -rf -- "$worktree"
  fi

  if ! git clone --depth=1 -b main "$clone_url" "$worktree" >/dev/null 2>&1; then
    log "WORKER-${worker_id}: ERROR cloning for #${issue_num}"
    return 1
  fi
  cd "$worktree" || return 1
  if ! git checkout -b "$branch" >/dev/null 2>&1; then
    log "WORKER-${worker_id}: ERROR creating branch $branch for #${issue_num}"
    return 1
  fi
  return 0
}

# Print a number taken from `git diff --shortstat origin/main..HEAD`.
# Arguments: $1 worktree, $2 "insertion" or "deletion"
diff_count() {
  local n
  n=$(cd "$1" 2>/dev/null \
    && git diff --shortstat origin/main..HEAD 2>/dev/null \
    | grep -oE "[0-9]+ $2" | grep -oE '[0-9]+') || n=0
  printf '%s\n' "${n:-0}"
}

# Append one JSON line to METRICS_FILE.
# Arguments (13): worker issue repo title outcome exit_code duration_s
#                 files_changed lines_added lines_removed salvaged pr merged
record_metrics() {
  python3 - "$@" <<'PY' >> "$METRICS_FILE" 2>/dev/null || log "WARN: could not write metrics line"
import datetime, json, sys

(worker, issue, repo, title, outcome, exit_code, duration,
 files, added, removed, salvaged, pr, merged) = sys.argv[1:14]
print(json.dumps({
    'ts': datetime.datetime.utcnow().isoformat() + 'Z',
    'worker': int(worker),
    'issue': int(issue),
    'repo': repo,
    'title': title[:80],
    'outcome': outcome,
    'exit_code': int(exit_code),
    'duration_s': int(duration),
    'files_changed': int(files),
    'lines_added': int(added),
    'lines_removed': int(removed),
    'salvaged': int(salvaged),
    'pr': pr,
    'merged': merged == 'true',
}))
PY
}

# === WORKER FUNCTION ===
# Endless loop: sweep own PRs, pick an issue, run Claude Code on it, salvage
# and push whatever exists, open/merge the PR, record metrics, clean up.
# Intended to run in a background subshell (see launch_worker).
# Arguments: $1 worker id
run_worker() {
  local worker_id="$1"
  local consecutive_failures=0
  local backoff issue_json issue_num issue_title repo_owner repo_name repo
  local issue_key branch wt_uuid worktree clone_url prompt claude_log
  local exit_code cycle_start cycle_duration dirty unpushed pushed
  local pr_num merged outcome files_changed lines_added lines_removed

  # Best effort: drop the held lock when this worker subshell exits early.
  CURRENT_LOCK=""
  trap 'if [ -n "${CURRENT_LOCK:-}" ]; then unlock_issue "$CURRENT_LOCK"; fi' EXIT

  log "WORKER-${worker_id}: Started"

  while true; do
    # Backoff on repeated failures
    if [ "$consecutive_failures" -ge 5 ]; then
      backoff=$((RATE_LIMIT_SLEEP * (consecutive_failures / 5)))
      if [ "$backoff" -gt "$MAX_RATE_SLEEP" ]; then
        backoff=$MAX_RATE_SLEEP
      fi
      log "WORKER-${worker_id}: BACKOFF ${backoff}s (${consecutive_failures} failures)"
      sleep "$backoff"
      consecutive_failures=0
    fi

    sweep_own_prs "$worker_id"

    # Get next issue
    issue_json=$(get_next_issue) || issue_json=""
    if [ -z "$issue_json" ] || [ "$issue_json" = "null" ]; then
      update_active "$worker_id" "" "" "idle"
      sleep 10
      continue
    fi

    issue_num=$(json_field "$issue_json" number) \
      && issue_title=$(json_field "$issue_json" title) \
      && repo_owner=$(json_field "$issue_json" repo_owner) \
      && repo_name=$(json_field "$issue_json" repo_name) \
      || {
        log "WORKER-${worker_id}: could not parse issue record, retrying"
        sleep 10
        continue
      }
    repo="${repo_owner}/${repo_name}"
    issue_key="${repo_owner}-${repo_name}-${issue_num}"
    branch="claude/issue-${issue_num}"
    # Use UUID for worktree dir to prevent collisions under high concurrency
    wt_uuid=$(/usr/bin/uuidgen 2>/dev/null || python3 -c "import uuid; print(uuid.uuid4())")
    worktree="${WORKTREE_BASE}/claude-${issue_num}-${wt_uuid}"
    clone_url="${GIT_BASE_URL}/${repo}.git"
    claude_log="$LOG_DIR/claude-${issue_num}.log"

    # Try to lock
    if ! lock_issue "$issue_key"; then
      sleep 5
      continue
    fi
    CURRENT_LOCK="$issue_key"

    log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo}) ==="
    update_active "$worker_id" "$issue_num" "$repo" "working"

    # Clone and pick up prior work if it exists
    if ! prepare_worktree "$worker_id" "$issue_num" "$branch" "$clone_url" "$worktree"; then
      cd "$HOME" || true
      cleanup_workdir "$worktree"
      unlock_issue "$issue_key"
      CURRENT_LOCK=""
      update_active "$worker_id" "" "" "done"
      consecutive_failures=$((consecutive_failures + 1))
      sleep "$COOLDOWN"
      continue
    fi

    # Build prompt and run
    prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name")

    log "WORKER-${worker_id}: Launching Claude Code for #${issue_num}..."
    cycle_start=$(date +%s)

    # "|| exit_code=$?" captures the status without tripping set -e.
    exit_code=0
    env -u CLAUDECODE "$TIMEOUT_BIN" "$CLAUDE_TIMEOUT" claude \
      --print \
      --model sonnet \
      --dangerously-skip-permissions \
      -p "$prompt" \
      > "$claude_log" 2>&1 || exit_code=$?

    cycle_duration=$(($(date +%s) - cycle_start))

    # ── SALVAGE: Never waste work. Commit+push whatever exists. ──
    dirty=0
    unpushed=0
    pushed=""
    # Only touch git when we are really inside the worktree, so a vanished
    # worktree can never cause "git add -A" somewhere else.
    if cd "$worktree" 2>/dev/null; then
      dirty=$(git status --porcelain 2>/dev/null | wc -l | tr -d ' ') || dirty=0
      if [ "${dirty:-0}" -gt 0 ]; then
        log "WORKER-${worker_id}: SALVAGING $dirty dirty files for #${issue_num}"
        git add -A 2>/dev/null || true
        git commit -m "WIP: Claude Code progress on #${issue_num}

Automated salvage commit — agent session ended (exit $exit_code).
Work in progress, may need continuation." >/dev/null 2>&1 || true
      fi

      # Push if we have any commits (including salvaged ones)
      unpushed=$(git log --oneline "origin/main..HEAD" 2>/dev/null | wc -l | tr -d ' ') || unpushed=0
      if [ "${unpushed:-0}" -gt 0 ]; then
        # shellcheck disable=SC2086 — WT_FORCE_PUSH expands to nothing or --force
        if git push -u ${WT_FORCE_PUSH:+--force} origin "$branch" 2>/dev/null; then
          pushed=1
          log "WORKER-${worker_id}: Pushed $unpushed commit(s) on $branch"
        else
          log "WORKER-${worker_id}: Push failed for $branch"
        fi
      fi
    fi

    # ── Create PR if branch was pushed and no PR exists yet ──
    pr_num=$(find_open_pr "$repo" "$branch") || pr_num=""
    if [ -z "$pr_num" ] && [ -n "$pushed" ]; then
      pr_num=$(create_pr "$repo" "$issue_num" "$branch" "$exit_code") || pr_num=""
      if [ -n "$pr_num" ]; then
        log "WORKER-${worker_id}: Created PR #${pr_num} for issue #${issue_num}"
      fi
    fi

    # ── Classify the run once; merge + close on success ──
    merged=false
    if [ "$exit_code" -eq 0 ]; then
      outcome="success"
      log "WORKER-${worker_id}: SUCCESS #${issue_num}"
      if [ -n "$pr_num" ]; then
        if merge_pr "$repo" "$pr_num" false; then
          merged=true
          curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${repo}/issues/${issue_num}" \
            -H "Authorization: token ${GITEA_TOKEN}" \
            -H "Content-Type: application/json" \
            -d '{"state": "closed"}' >/dev/null 2>&1 || true
          log "WORKER-${worker_id}: PR #${pr_num} merged, issue #${issue_num} closed"
        else
          # Leave the issue open; the PR sweep retries the merge later.
          log "WORKER-${worker_id}: PR #${pr_num} not merged yet, issue #${issue_num} left open"
        fi
      fi
      consecutive_failures=0
    elif [ "$exit_code" -eq 124 ]; then
      outcome="timeout"
      log "WORKER-${worker_id}: TIMEOUT #${issue_num} (work saved in PR)"
      consecutive_failures=$((consecutive_failures + 1))
    elif grep -q "rate_limit\|rate limit\|429\|overloaded" "$claude_log" 2>/dev/null; then
      outcome="rate_limited"
      log "WORKER-${worker_id}: RATE LIMITED on #${issue_num} — backing off (work saved)"
      consecutive_failures=$((consecutive_failures + 3))
    else
      outcome="failed"
      log "WORKER-${worker_id}: FAILED #${issue_num} exit ${exit_code} (work saved in PR)"
      consecutive_failures=$((consecutive_failures + 1))
    fi

    # ── METRICS: structured JSONL for reporting ──
    lines_added=$(diff_count "$worktree" insertion)
    lines_removed=$(diff_count "$worktree" deletion)
    files_changed=$(cd "$worktree" 2>/dev/null \
      && git diff --name-only origin/main..HEAD 2>/dev/null | wc -l | tr -d ' ') || files_changed=0

    record_metrics "$worker_id" "$issue_num" "$repo" "$issue_title" "$outcome" \
      "$exit_code" "$cycle_duration" "${files_changed:-0}" "${lines_added:-0}" \
      "${lines_removed:-0}" "${dirty:-0}" "${pr_num:-}" "$merged"

    # Cleanup — leave the worktree first so later git calls have a valid cwd.
    cd "$HOME" || true
    cleanup_workdir "$worktree"
    unlock_issue "$issue_key"
    CURRENT_LOCK=""
    update_active "$worker_id" "" "" "done"

    sleep "$COOLDOWN"
  done
}

# === MAIN ===
log "=== Claude Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ==="
log "Worktrees: ${WORKTREE_BASE}"

# Clean stale locks
rm -rf -- "${LOCK_DIR:?}"/*.lock 2>/dev/null || true

# PID tracking via files (bash 3.2 compatible)
PID_DIR="$LOG_DIR/claude-pids"
mkdir -p "$PID_DIR"
rm -f -- "${PID_DIR:?}"/*.pid 2>/dev/null || true

# Start worker $1 in the background and record its PID.
launch_worker() {
  local wid="$1"
  run_worker "$wid" &
  echo $! > "$PID_DIR/${wid}.pid"
  log "Launched worker $wid (PID $!)"
}

# Initial launch
for ((wid = 1; wid <= NUM_WORKERS; wid++)); do
  launch_worker "$wid"
  sleep 3
done

# === DYNAMIC SCALER ===
# Every 90 seconds: relaunch dead workers, halve the pool (minimum 2) if the
# recent log shows rate limiting, grow it by 2 (up to MAX_WORKERS) if healthy.
CURRENT_WORKERS="$NUM_WORKERS"
while true; do
  sleep 90

  # Reap dead workers and relaunch
  for pidfile in "$PID_DIR"/*.pid; do
    [ -f "$pidfile" ] || continue
    wid=$(basename "$pidfile" .pid)
    wpid=$(cat "$pidfile")
    if ! kill -0 "$wpid" 2>/dev/null; then
      log "SCALER: Worker $wid died — relaunching"
      launch_worker "$wid"
      sleep 2
    fi
  done

  # grep -c prints the count but exits 1 on zero matches, hence "|| true".
  recent_rate_limits=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true)
  recent_successes=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "SUCCESS" || true)

  if [ "${recent_rate_limits:-0}" -gt 0 ]; then
    if [ "$CURRENT_WORKERS" -gt 2 ]; then
      drop_to=$((CURRENT_WORKERS / 2))
      if [ "$drop_to" -lt 2 ]; then
        drop_to=2
      fi
      log "SCALER: Rate limited — scaling ${CURRENT_WORKERS} → ${drop_to} workers"
      for ((wid = drop_to + 1; wid <= CURRENT_WORKERS; wid++)); do
        if [ -f "$PID_DIR/${wid}.pid" ]; then
          kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true
          rm -f -- "$PID_DIR/${wid}.pid"
          update_active "$wid" "" "" "done"
        fi
      done
      CURRENT_WORKERS=$drop_to
    fi
  elif [ "${recent_successes:-0}" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then
    new_count=$((CURRENT_WORKERS + 2))
    if [ "$new_count" -gt "$MAX_WORKERS" ]; then
      new_count=$MAX_WORKERS
    fi
    log "SCALER: Healthy — scaling ${CURRENT_WORKERS} → ${new_count} workers"
    for ((wid = CURRENT_WORKERS + 1; wid <= new_count; wid++)); do
      launch_worker "$wid"
      sleep 2
    done
    CURRENT_WORKERS=$new_count
  fi
done

# ===== file: bin/claudemax-watchdog.sh ======================================
#!/usr/bin/env bash
# claudemax-watchdog.sh — keep local Claude/Gemini loops alive without stale tmux assumptions
#
# One-shot: (re)starts the agent loops if they are not running, checks the
# open-issue backlog, and runs optional helper scripts. Meant to be invoked
# periodically by an external scheduler.

set -uo pipefail
export PATH="/opt/homebrew/bin:$HOME/.local/bin:$HOME/.hermes/bin:/usr/local/bin:$PATH"

LOG="$HOME/.hermes/logs/claudemax-watchdog.log"
GITEA_URL="http://143.198.27.163:3000"
GITEA_TOKEN_FILE="$HOME/.hermes/gitea_token_vps"
GITEA_TOKEN=""
if [ -r "$GITEA_TOKEN_FILE" ]; then
  GITEA_TOKEN=$(tr -d '[:space:]' < "$GITEA_TOKEN_FILE" 2>/dev/null || true)
fi
REPO_API="$GITEA_URL/api/v1/repos/Timmy_Foundation/the-nexus"
MIN_OPEN_ISSUES=10
CLAUDE_WORKERS=2
GEMINI_WORKERS=1

# Append a timestamped, prefixed line to the watchdog log.
log() {
  printf '[%s] CLAUDEMAX: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG"
}

# Start a loop unless a process matching the pattern is already running.
# Arguments: $1 display name, $2 pgrep -f pattern, $3 command line for bash -lc
start_loop() {
  local name="$1"
  local pattern="$2"
  local cmd="$3"
  local pid

  pid=$(pgrep -f "$pattern" 2>/dev/null | head -1 || true)
  if [ -n "$pid" ]; then
    log "$name alive (PID $pid)"
    return 0
  fi

  log "$name not running. Restarting..."
  nohup bash -lc "$cmd" >/dev/null 2>&1 &
  sleep 2

  pid=$(pgrep -f "$pattern" 2>/dev/null | head -1 || true)
  if [ -n "$pid" ]; then
    log "Restarted $name (PID $pid)"
  else
    log "ERROR: failed to start $name"
  fi
}

# Run a helper script if it exists and is executable, copying its output
# into the watchdog log line by line.
# Arguments: $1 label for log messages, $2 script path
run_optional_script() {
  local label="$1"
  local script_path="$2"
  local line

  if [ -x "$script_path" ]; then
    bash "$script_path" 2>&1 | while IFS= read -r line; do
      log "$line"
    done
  else
    log "$label skipped — missing $script_path"
  fi
}

# Print a file's modification time as epoch seconds (0 if unknown).
# Tries BSD stat first, then GNU stat; non-numeric output is rejected because
# GNU `stat -f` means something else and prints unrelated text.
file_mtime() {
  local m
  m=$(stat -f %m "$1" 2>/dev/null) || m=""
  case "$m" in
    '' | *[!0-9]*) m=$(stat -c %Y "$1" 2>/dev/null) || m="" ;;
  esac
  case "$m" in
    '' | *[!0-9]*) m=0 ;;
  esac
  printf '%s\n' "$m"
}

# Returns 0 if any claude-*.log modified in the last 12 hours contains the
# quota message, 1 otherwise.
claude_quota_blocked() {
  local cutoff now mtime f
  now=$(date +%s)
  cutoff=$((now - 43200))
  for f in "$HOME"/.hermes/logs/claude-*.log; do
    [ -f "$f" ] || continue
    mtime=$(file_mtime "$f")
    if [ "$mtime" -ge "$cutoff" ] && grep -q "You've hit your limit" "$f" 2>/dev/null; then
      return 0
    fi
  done
  return 1
}

if [ -z "$GITEA_TOKEN" ]; then
  log "ERROR: missing Gitea token at ~/.hermes/gitea_token_vps"
  exit 1
fi

if claude_quota_blocked; then
  log "Claude quota exhausted recently — not starting claude-loop until quota resets or logs age out"
else
  start_loop "claude-loop" "bash .*claude-loop.sh" "bash ~/.hermes/bin/claude-loop.sh $CLAUDE_WORKERS >> ~/.hermes/logs/claude-loop.log 2>&1"
fi
start_loop "gemini-loop" "bash .*gemini-loop.sh" "bash ~/.hermes/bin/gemini-loop.sh $GEMINI_WORKERS >> ~/.hermes/logs/gemini-loop.log 2>&1"

# Empty OPEN_COUNT means "could not determine" — an unreachable API must not
# look like an empty backlog.
OPEN_COUNT=$(curl -sf --max-time 10 -H "Authorization: token $GITEA_TOKEN" \
  "$REPO_API/issues?state=open&type=issues&limit=100" 2>/dev/null \
  | python3 -c 'import json, sys; print(len(json.load(sys.stdin)))' 2>/dev/null) || OPEN_COUNT=""

if [ -z "$OPEN_COUNT" ]; then
  log "Open issues: unknown (Gitea API request failed) — skipping replenishment check"
else
  log "Open issues: $OPEN_COUNT (minimum: $MIN_OPEN_ISSUES)"
  if [ "$OPEN_COUNT" -lt "$MIN_OPEN_ISSUES" ]; then
    log "Backlog running low. Checking replenishment helper..."
    run_optional_script "claudemax-replenish" "$HOME/.hermes/bin/claudemax-replenish.sh"
  fi
fi

run_optional_script "autodeploy-matrix" "$HOME/.hermes/bin/autodeploy-matrix.sh"
log "Watchdog complete."