feat: parallel workers for all agents, dynamic scaling, self-healing watchdog
- claude-loop: 7 workers default, scales up to 21, 5s cooldown - gemini-loop: rewritten as parallel worker system (3→12), multi-repo, auto-clone, correct CLI flags (-p/--yolo), bash 3.2 compatible - loop-watchdog: monitors all loops every 2min, auto-restarts dead loops, kills zombies, files Gitea issues for unfixable problems - ops-helpers: added ops-wake-watchdog, ops-kill-watchdog - All scripts use file-based PID tracking (bash 3.2 safe) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -73,6 +73,7 @@ bin/*
|
|||||||
!bin/timmy-loopstat.sh
|
!bin/timmy-loopstat.sh
|
||||||
!bin/start-dashboard.sh
|
!bin/start-dashboard.sh
|
||||||
!bin/gemini-loop.sh
|
!bin/gemini-loop.sh
|
||||||
|
!bin/loop-watchdog.sh
|
||||||
|
|
||||||
# ── Queue (transient task queue) ─────────────────────────────────────
|
# ── Queue (transient task queue) ─────────────────────────────────────
|
||||||
queue/
|
queue/
|
||||||
|
|||||||
@@ -8,12 +8,13 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# === CONFIG ===
|
# === CONFIG ===
|
||||||
NUM_WORKERS="${1:-3}"
|
NUM_WORKERS="${1:-7}"
|
||||||
|
MAX_WORKERS=21 # absolute ceiling
|
||||||
WORKTREE_BASE="$HOME/worktrees"
|
WORKTREE_BASE="$HOME/worktrees"
|
||||||
GITEA_URL="http://143.198.27.163:3000"
|
GITEA_URL="http://143.198.27.163:3000"
|
||||||
GITEA_TOKEN=$(cat "$HOME/.hermes/claude_token")
|
GITEA_TOKEN=$(cat "$HOME/.hermes/claude_token")
|
||||||
CLAUDE_TIMEOUT=900 # 15 min per issue
|
CLAUDE_TIMEOUT=900 # 15 min per issue
|
||||||
COOLDOWN=15 # seconds between launching workers
|
COOLDOWN=5 # seconds between issues (fast cycle)
|
||||||
RATE_LIMIT_SLEEP=60 # initial sleep on rate limit
|
RATE_LIMIT_SLEEP=60 # initial sleep on rate limit
|
||||||
MAX_RATE_SLEEP=300 # max backoff on rate limit
|
MAX_RATE_SLEEP=300 # max backoff on rate limit
|
||||||
LOG_DIR="$HOME/.hermes/logs"
|
LOG_DIR="$HOME/.hermes/logs"
|
||||||
@@ -402,18 +403,73 @@ else: print('')
|
|||||||
}
|
}
|
||||||
|
|
||||||
# === MAIN ===
|
# === MAIN ===
|
||||||
log "=== Claude Loop Started — ${NUM_WORKERS} workers ==="
|
log "=== Claude Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ==="
|
||||||
log "Worktrees: ${WORKTREE_BASE}"
|
log "Worktrees: ${WORKTREE_BASE}"
|
||||||
|
|
||||||
# Clean stale locks
|
# Clean stale locks
|
||||||
rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
|
rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
|
||||||
|
|
||||||
# Launch workers
|
# PID tracking via files (bash 3.2 compatible)
|
||||||
|
PID_DIR="$LOG_DIR/claude-pids"
|
||||||
|
mkdir -p "$PID_DIR"
|
||||||
|
rm -f "$PID_DIR"/*.pid 2>/dev/null
|
||||||
|
|
||||||
|
launch_worker() {
|
||||||
|
local wid="$1"
|
||||||
|
run_worker "$wid" &
|
||||||
|
echo $! > "$PID_DIR/${wid}.pid"
|
||||||
|
log "Launched worker $wid (PID $!)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Initial launch
|
||||||
for i in $(seq 1 "$NUM_WORKERS"); do
|
for i in $(seq 1 "$NUM_WORKERS"); do
|
||||||
run_worker "$i" &
|
launch_worker "$i"
|
||||||
log "Launched worker $i (PID $!)"
|
sleep 3
|
||||||
sleep 5 # stagger starts
|
|
||||||
done
|
done
|
||||||
|
|
||||||
# Wait for all workers
|
# === DYNAMIC SCALER ===
|
||||||
wait
|
# Every 3 minutes: check health, scale up if no rate limits, scale down if hitting limits
|
||||||
|
CURRENT_WORKERS="$NUM_WORKERS"
|
||||||
|
while true; do
|
||||||
|
sleep 180
|
||||||
|
|
||||||
|
# Reap dead workers and relaunch
|
||||||
|
for pidfile in "$PID_DIR"/*.pid; do
|
||||||
|
[ -f "$pidfile" ] || continue
|
||||||
|
wid=$(basename "$pidfile" .pid)
|
||||||
|
wpid=$(cat "$pidfile")
|
||||||
|
if ! kill -0 "$wpid" 2>/dev/null; then
|
||||||
|
log "SCALER: Worker $wid died — relaunching"
|
||||||
|
launch_worker "$wid"
|
||||||
|
sleep 2
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
recent_rate_limits=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true)
|
||||||
|
recent_successes=$(tail -100 "$LOG_DIR/claude-loop.log" 2>/dev/null | grep -c "SUCCESS" || true)
|
||||||
|
|
||||||
|
if [ "$recent_rate_limits" -gt 0 ]; then
|
||||||
|
if [ "$CURRENT_WORKERS" -gt 2 ]; then
|
||||||
|
drop_to=$(( CURRENT_WORKERS / 2 ))
|
||||||
|
[ "$drop_to" -lt 2 ] && drop_to=2
|
||||||
|
log "SCALER: Rate limited — scaling ${CURRENT_WORKERS} → ${drop_to} workers"
|
||||||
|
for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do
|
||||||
|
if [ -f "$PID_DIR/${wid}.pid" ]; then
|
||||||
|
kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true
|
||||||
|
rm -f "$PID_DIR/${wid}.pid"
|
||||||
|
update_active "$wid" "" "" "done"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
CURRENT_WORKERS=$drop_to
|
||||||
|
fi
|
||||||
|
elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then
|
||||||
|
new_count=$(( CURRENT_WORKERS + 2 ))
|
||||||
|
[ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS
|
||||||
|
log "SCALER: Healthy — scaling ${CURRENT_WORKERS} → ${new_count} workers"
|
||||||
|
for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do
|
||||||
|
launch_worker "$wid"
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
CURRENT_WORKERS=$new_count
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|||||||
@@ -1,321 +1,437 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# gemini-loop.sh — Dropout-proof Gemini code agent dispatch loop
|
# gemini-loop.sh — Parallel Gemini Code agent dispatch loop
|
||||||
# Picks an open issue from Gitea, creates a worktree, runs Gemini Code CLI,
|
# Runs N workers concurrently against the Gitea backlog.
|
||||||
# handles failures gracefully, and loops forever.
|
# Dynamic scaling: starts at N, scales up to MAX, drops on rate limits.
|
||||||
#
|
#
|
||||||
# Dropout-proof means:
|
# Usage: gemini-loop.sh [NUM_WORKERS] (default: 3)
|
||||||
# - If Gemini Code crashes/hangs, we kill it and move on
|
|
||||||
# - If worktree creation fails, skip and retry
|
|
||||||
# - If push fails, log and continue
|
|
||||||
# - Exponential backoff on repeated failures
|
|
||||||
# - Clean up worktrees after PR is created
|
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# === CONFIG ===
|
# === CONFIG ===
|
||||||
REPO_DIR="$HOME/worktrees/gemini-repo"
|
NUM_WORKERS="${1:-3}"
|
||||||
|
MAX_WORKERS=12
|
||||||
WORKTREE_BASE="$HOME/worktrees"
|
WORKTREE_BASE="$HOME/worktrees"
|
||||||
GITEA_URL="http://143.198.27.163:3000"
|
GITEA_URL="http://143.198.27.163:3000"
|
||||||
GITEA_TOKEN=$(cat "$HOME/.hermes/gemini_token")
|
GITEA_TOKEN=$(cat "$HOME/.hermes/gemini_token")
|
||||||
REPO_OWNER="rockachopa"
|
|
||||||
REPO_NAME="Timmy-time-dashboard"
|
|
||||||
GEMINI_TIMEOUT=600 # 10 min per issue
|
GEMINI_TIMEOUT=600 # 10 min per issue
|
||||||
COOLDOWN=30 # seconds between issues
|
COOLDOWN=5 # seconds between issues
|
||||||
MAX_FAILURES=5 # consecutive failures before long sleep
|
RATE_LIMIT_SLEEP=60
|
||||||
LONG_SLEEP=300 # 5 min backoff on repeated failures
|
MAX_RATE_SLEEP=300
|
||||||
LOG_DIR="$HOME/.hermes/logs"
|
LOG_DIR="$HOME/.hermes/logs"
|
||||||
SKIP_FILE="$LOG_DIR/gemini-skip-list.json" # issues to skip temporarily
|
SKIP_FILE="$LOG_DIR/gemini-skip-list.json"
|
||||||
|
LOCK_DIR="$LOG_DIR/gemini-locks"
|
||||||
|
ACTIVE_FILE="$LOG_DIR/gemini-active.json"
|
||||||
|
|
||||||
mkdir -p "$LOG_DIR" "$WORKTREE_BASE"
|
mkdir -p "$LOG_DIR" "$WORKTREE_BASE" "$LOCK_DIR"
|
||||||
|
|
||||||
# Initialize skip file if missing
|
|
||||||
[ -f "$SKIP_FILE" ] || echo '{}' > "$SKIP_FILE"
|
[ -f "$SKIP_FILE" ] || echo '{}' > "$SKIP_FILE"
|
||||||
|
echo '{}' > "$ACTIVE_FILE"
|
||||||
|
|
||||||
# === STATE ===
|
# === SHARED FUNCTIONS ===
|
||||||
failure_count=0
|
log() {
|
||||||
issues_completed=0
|
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_DIR/gemini-loop.log"
|
||||||
|
}
|
||||||
|
|
||||||
# === SKIP LIST FUNCTIONS ===
|
lock_issue() {
|
||||||
is_skipped() {
|
local issue_key="$1"
|
||||||
local issue_num="$1"
|
local lockfile="$LOCK_DIR/$issue_key.lock"
|
||||||
python3 -c "
|
if mkdir "$lockfile" 2>/dev/null; then
|
||||||
import json, time, sys
|
echo $$ > "$lockfile/pid"
|
||||||
try:
|
return 0
|
||||||
with open('$SKIP_FILE') as f: skips = json.load(f)
|
fi
|
||||||
except: skips = {}
|
return 1
|
||||||
entry = skips.get(str($issue_num), {})
|
}
|
||||||
if entry and entry.get('until', 0) > time.time():
|
|
||||||
print('skip')
|
unlock_issue() {
|
||||||
sys.exit(0)
|
rm -rf "$LOCK_DIR/$1.lock" 2>/dev/null
|
||||||
# Expired or not found — clean up and allow
|
|
||||||
if str($issue_num) in skips:
|
|
||||||
del skips[str($issue_num)]
|
|
||||||
with open('$SKIP_FILE', 'w') as f: json.dump(skips, f)
|
|
||||||
print('ok')
|
|
||||||
" 2>/dev/null
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mark_skip() {
|
mark_skip() {
|
||||||
local issue_num="$1"
|
local issue_num="$1" reason="$2" skip_hours="${3:-1}"
|
||||||
local reason="$2"
|
|
||||||
local skip_hours="${3:-1}" # default 1 hour
|
|
||||||
python3 -c "
|
python3 -c "
|
||||||
import json, time
|
import json, time, fcntl
|
||||||
try:
|
with open('$SKIP_FILE', 'r+') as f:
|
||||||
with open('$SKIP_FILE') as f: skips = json.load(f)
|
fcntl.flock(f, fcntl.LOCK_EX)
|
||||||
except: skips = {}
|
try: skips = json.load(f)
|
||||||
skips[str($issue_num)] = {
|
except: skips = {}
|
||||||
'until': time.time() + ($skip_hours * 3600),
|
skips[str($issue_num)] = {
|
||||||
'reason': '$reason',
|
'until': time.time() + ($skip_hours * 3600),
|
||||||
'failures': skips.get(str($issue_num), {}).get('failures', 0) + 1
|
'reason': '$reason',
|
||||||
}
|
'failures': skips.get(str($issue_num), {}).get('failures', 0) + 1
|
||||||
# If 3+ failures, skip for 6 hours instead
|
}
|
||||||
if skips[str($issue_num)]['failures'] >= 3:
|
if skips[str($issue_num)]['failures'] >= 3:
|
||||||
skips[str($issue_num)]['until'] = time.time() + (6 * 3600)
|
skips[str($issue_num)]['until'] = time.time() + (6 * 3600)
|
||||||
with open('$SKIP_FILE', 'w') as f: json.dump(skips, f, indent=2)
|
f.seek(0)
|
||||||
|
f.truncate()
|
||||||
|
json.dump(skips, f, indent=2)
|
||||||
" 2>/dev/null
|
" 2>/dev/null
|
||||||
log "SKIP: #${issue_num} added to skip list — ${reason}"
|
log "SKIP: #${issue_num} — ${reason}"
|
||||||
}
|
}
|
||||||
|
|
||||||
log() {
|
update_active() {
|
||||||
local msg="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
|
local worker="$1" issue="$2" repo="$3" status="$4"
|
||||||
echo "$msg" >> "$LOG_DIR/gemini-loop.log"
|
python3 -c "
|
||||||
|
import json, fcntl
|
||||||
|
with open('$ACTIVE_FILE', 'r+') as f:
|
||||||
|
fcntl.flock(f, fcntl.LOCK_EX)
|
||||||
|
try: active = json.load(f)
|
||||||
|
except: active = {}
|
||||||
|
if '$status' == 'done':
|
||||||
|
active.pop('$worker', None)
|
||||||
|
else:
|
||||||
|
active['$worker'] = {'issue': '$issue', 'repo': '$repo', 'status': '$status'}
|
||||||
|
f.seek(0)
|
||||||
|
f.truncate()
|
||||||
|
json.dump(active, f, indent=2)
|
||||||
|
" 2>/dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup_worktree() {
|
cleanup_worktree() {
|
||||||
local wt="$1"
|
local wt="$1" branch="$2"
|
||||||
local branch="$2"
|
|
||||||
if [ -d "$wt" ]; then
|
if [ -d "$wt" ]; then
|
||||||
cd "$REPO_DIR"
|
local parent
|
||||||
|
parent=$(git -C "$wt" rev-parse --git-common-dir 2>/dev/null | sed 's|/.git$||' || true)
|
||||||
|
[ -n "$parent" ] && [ -d "$parent" ] && cd "$parent"
|
||||||
git worktree remove --force "$wt" 2>/dev/null || rm -rf "$wt"
|
git worktree remove --force "$wt" 2>/dev/null || rm -rf "$wt"
|
||||||
git worktree prune 2>/dev/null
|
git worktree prune 2>/dev/null
|
||||||
git branch -D "$branch" 2>/dev/null || true
|
git branch -D "$branch" 2>/dev/null || true
|
||||||
log "Cleaned up worktree: $wt"
|
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
get_next_issue() {
|
get_next_issue() {
|
||||||
# Get open issues ASSIGNED TO GEMINI only — Gemini works its own queue
|
python3 -c "
|
||||||
# NOTE: Gitea's assignee filter is unreliable — we validate in Python
|
import json, sys, time, urllib.request, os
|
||||||
local skip_file="$SKIP_FILE"
|
|
||||||
curl -sf "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues?state=open&type=issues&limit=50&sort=created" \
|
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
|
||||||
import sys, json, time
|
|
||||||
|
|
||||||
issues = json.load(sys.stdin)
|
token = '${GITEA_TOKEN}'
|
||||||
# Reverse to oldest-first (Gitea returns newest-first) — respects dependency order
|
base = '${GITEA_URL}'
|
||||||
issues.reverse()
|
repos = [
|
||||||
|
'rockachopa/Timmy-time-dashboard',
|
||||||
|
'rockachopa/alexanderwhitestone.com',
|
||||||
|
'rockachopa/hermes-agent',
|
||||||
|
'replit/timmy-tower',
|
||||||
|
'replit/token-gated-economy',
|
||||||
|
]
|
||||||
|
|
||||||
# Load skip list
|
|
||||||
try:
|
try:
|
||||||
with open('${skip_file}') as f: skips = json.load(f)
|
with open('${SKIP_FILE}') as f: skips = json.load(f)
|
||||||
except: skips = {}
|
except: skips = {}
|
||||||
|
|
||||||
for i in issues:
|
try:
|
||||||
# MUST be assigned to gemini (Gitea filter is broken, validate here)
|
with open('${ACTIVE_FILE}') as f:
|
||||||
|
active = json.load(f)
|
||||||
|
active_issues = {v['issue'] for v in active.values()}
|
||||||
|
except:
|
||||||
|
active_issues = set()
|
||||||
|
|
||||||
|
all_issues = []
|
||||||
|
for repo in repos:
|
||||||
|
url = f'{base}/api/v1/repos/{repo}/issues?state=open&type=issues&limit=50&sort=created'
|
||||||
|
req = urllib.request.Request(url, headers={'Authorization': f'token {token}'})
|
||||||
|
try:
|
||||||
|
resp = urllib.request.urlopen(req, timeout=10)
|
||||||
|
issues = json.loads(resp.read())
|
||||||
|
for i in issues:
|
||||||
|
i['_repo'] = repo
|
||||||
|
all_issues.extend(issues)
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
|
||||||
|
def priority(i):
|
||||||
|
t = i['title'].lower()
|
||||||
|
if '[urgent]' in t or 'urgent:' in t: return 0
|
||||||
|
if '[p0]' in t: return 1
|
||||||
|
if '[p1]' in t: return 2
|
||||||
|
if '[bug]' in t: return 3
|
||||||
|
if 'lhf:' in t or 'lhf ' in t: return 4
|
||||||
|
if '[p2]' in t: return 5
|
||||||
|
return 6
|
||||||
|
|
||||||
|
all_issues.sort(key=priority)
|
||||||
|
|
||||||
|
for i in all_issues:
|
||||||
assignees = [a['login'] for a in (i.get('assignees') or [])]
|
assignees = [a['login'] for a in (i.get('assignees') or [])]
|
||||||
if 'gemini' not in assignees:
|
if 'gemini' not in assignees:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
title = i['title'].lower()
|
title = i['title'].lower()
|
||||||
# Skip philosophy, epics, showcases, features (not 10-min code work)
|
|
||||||
if '[philosophy]' in title: continue
|
if '[philosophy]' in title: continue
|
||||||
if '[epic]' in title or 'epic:' in title: continue
|
if '[epic]' in title or 'epic:' in title: continue
|
||||||
if '[showcase]' in title: continue
|
if '[showcase]' in title: continue
|
||||||
if '[feature]' in title: continue
|
|
||||||
|
|
||||||
# Check skip list
|
|
||||||
num_str = str(i['number'])
|
num_str = str(i['number'])
|
||||||
entry = skips.get(num_str, {})
|
if num_str in active_issues: continue
|
||||||
if entry and entry.get('until', 0) > time.time():
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(json.dumps({'number': i['number'], 'title': i['title']}))
|
entry = skips.get(num_str, {})
|
||||||
|
if entry and entry.get('until', 0) > time.time(): continue
|
||||||
|
|
||||||
|
lock = '${LOCK_DIR}/' + i['_repo'].replace('/', '-') + '-' + num_str + '.lock'
|
||||||
|
if os.path.isdir(lock): continue
|
||||||
|
|
||||||
|
repo = i['_repo']
|
||||||
|
owner, name = repo.split('/')
|
||||||
|
print(json.dumps({
|
||||||
|
'number': i['number'],
|
||||||
|
'title': i['title'],
|
||||||
|
'repo_owner': owner,
|
||||||
|
'repo_name': name,
|
||||||
|
'repo': repo,
|
||||||
|
}))
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
print('null')
|
print('null')
|
||||||
" 2>/dev/null
|
" 2>/dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
build_prompt() {
|
build_prompt() {
|
||||||
local issue_num="$1"
|
local issue_num="$1" issue_title="$2" worktree="$3" repo_owner="$4" repo_name="$5"
|
||||||
local issue_title="$2"
|
|
||||||
local worktree="$3"
|
|
||||||
|
|
||||||
cat <<PROMPT
|
cat <<PROMPT
|
||||||
You are Gemini, an autonomous code agent on the Timmy-time-dashboard project.
|
You are Gemini, an autonomous code agent on the ${repo_name} project.
|
||||||
|
|
||||||
YOUR ISSUE: #${issue_num} — "${issue_title}"
|
YOUR ISSUE: #${issue_num} — "${issue_title}"
|
||||||
|
|
||||||
GITEA API: ${GITEA_URL}/api/v1
|
GITEA API: ${GITEA_URL}/api/v1
|
||||||
GITEA TOKEN: ${GITEA_TOKEN}
|
GITEA TOKEN: ${GITEA_TOKEN}
|
||||||
REPO: ${REPO_OWNER}/${REPO_NAME}
|
REPO: ${repo_owner}/${repo_name}
|
||||||
WORKING DIRECTORY: ${worktree}
|
WORKING DIRECTORY: ${worktree}
|
||||||
|
|
||||||
== YOUR POWERS ==
|
== YOUR POWERS ==
|
||||||
You can do ANYTHING a developer can do. You are not limited to the narrow task.
|
You can do ANYTHING a developer can do.
|
||||||
|
|
||||||
1. READ the issue. Read any comments — they may have instructions.
|
1. READ the issue and any comments for context:
|
||||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}"
|
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}"
|
||||||
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}/comments"
|
curl -s -H "Authorization: token ${GITEA_TOKEN}" "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments"
|
||||||
|
|
||||||
2. DO THE WORK. Code, test, fix, refactor — whatever the issue needs.
|
2. DO THE WORK. Code, test, fix, refactor — whatever the issue needs.
|
||||||
- tox -e format (auto-format first)
|
- Check for tox.ini / Makefile / package.json for test/lint commands
|
||||||
- tox -e unit (all tests must pass)
|
- Run tests if the project has them
|
||||||
- tox -e lint (must be clean)
|
- Follow existing code conventions
|
||||||
|
|
||||||
3. COMMIT with conventional commits: fix: / feat: / refactor: / test: / chore:
|
3. COMMIT with conventional commits: fix: / feat: / refactor: / test: / chore:
|
||||||
Include "Fixes #${issue_num}" or "Refs #${issue_num}" in the message.
|
Include "Fixes #${issue_num}" or "Refs #${issue_num}" in the message.
|
||||||
|
|
||||||
4. PUSH to your branch (gemini/issue-${issue_num}) and CREATE A PR:
|
4. PUSH to your branch (gemini/issue-${issue_num}) and CREATE A PR:
|
||||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/pulls" \
|
git push origin gemini/issue-${issue_num}
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls" \\
|
||||||
-H "Content-Type: application/json" \
|
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||||
|
-H "Content-Type: application/json" \\
|
||||||
-d '{"title": "[gemini] <description> (#${issue_num})", "body": "Fixes #${issue_num}\n\n<describe what you did>", "head": "gemini/issue-${issue_num}", "base": "main"}'
|
-d '{"title": "[gemini] <description> (#${issue_num})", "body": "Fixes #${issue_num}\n\n<describe what you did>", "head": "gemini/issue-${issue_num}", "base": "main"}'
|
||||||
|
|
||||||
5. COMMENT on the issue when done:
|
5. COMMENT on the issue when done:
|
||||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}/comments" \
|
curl -s -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}/comments" \\
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
-H "Authorization: token ${GITEA_TOKEN}" \\
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \\
|
||||||
-d '{"body": "PR created. <summary of changes>"}'
|
-d '{"body": "PR created. <summary of changes>"}'
|
||||||
|
|
||||||
6. FILE NEW ISSUES if you find bugs, missing tests, or improvements while working:
|
|
||||||
curl -s -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues" \
|
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"title": "[gemini-generated] <title>", "body": "<description>"}'
|
|
||||||
|
|
||||||
== RULES ==
|
== RULES ==
|
||||||
- Read CLAUDE.md or project README first for conventions
|
- Read CLAUDE.md or project README first for conventions
|
||||||
- tox is the ONLY way to run tests/lint/format. Never run pytest/ruff directly.
|
- If the project has tox, use tox. If npm, use npm. Follow the project.
|
||||||
- Never use --no-verify on git commands.
|
- Never use --no-verify on git commands.
|
||||||
- If tests fail after 2 attempts, STOP and comment on the issue explaining why.
|
- If tests fail after 2 attempts, STOP and comment on the issue explaining why.
|
||||||
- Be thorough. If you see something broken nearby, file an issue for it.
|
- Be thorough but focused. Fix the issue, don't refactor the world.
|
||||||
PROMPT
|
PROMPT
|
||||||
}
|
}
|
||||||
|
|
||||||
# === MAIN LOOP ===
|
# === WORKER FUNCTION ===
|
||||||
log "=== Gemini Loop Started ==="
|
run_worker() {
|
||||||
log "Repo: ${REPO_DIR}"
|
local worker_id="$1"
|
||||||
log "Worktrees: ${WORKTREE_BASE}"
|
local consecutive_failures=0
|
||||||
|
|
||||||
while true; do
|
log "WORKER-${worker_id}: Started"
|
||||||
# Check for too many consecutive failures
|
|
||||||
if [ "$failure_count" -ge "$MAX_FAILURES" ]; then
|
|
||||||
log "BACKOFF: ${failure_count} consecutive failures. Sleeping ${LONG_SLEEP}s..."
|
|
||||||
sleep "$LONG_SLEEP"
|
|
||||||
failure_count=0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Fetch latest main (resilient — never die on git errors)
|
while true; do
|
||||||
cd "$REPO_DIR"
|
if [ "$consecutive_failures" -ge 5 ]; then
|
||||||
timeout 60 git fetch origin main 2>/dev/null || { log "WARN: git fetch failed, continuing anyway"; }
|
local backoff=$((RATE_LIMIT_SLEEP * (consecutive_failures / 5)))
|
||||||
git checkout main 2>/dev/null || true
|
[ "$backoff" -gt "$MAX_RATE_SLEEP" ] && backoff=$MAX_RATE_SLEEP
|
||||||
git reset --hard origin/main 2>/dev/null || true
|
log "WORKER-${worker_id}: BACKOFF ${backoff}s (${consecutive_failures} failures)"
|
||||||
|
sleep "$backoff"
|
||||||
# Get next issue
|
consecutive_failures=0
|
||||||
issue_json=$(get_next_issue)
|
|
||||||
|
|
||||||
if [ "$issue_json" = "null" ] || [ -z "$issue_json" ]; then
|
|
||||||
# Only log idle ONCE, then go quiet until work appears
|
|
||||||
if [ "${LAST_STATE:-}" != "idle" ]; then
|
|
||||||
log "Queue empty. Waiting for assignments..."
|
|
||||||
LAST_STATE="idle"
|
|
||||||
fi
|
fi
|
||||||
sleep "$LONG_SLEEP"
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
LAST_STATE="working"
|
|
||||||
|
|
||||||
issue_num=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['number'])")
|
issue_json=$(get_next_issue)
|
||||||
issue_title=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])")
|
|
||||||
branch="gemini/issue-${issue_num}"
|
|
||||||
worktree="${WORKTREE_BASE}/gemini-${issue_num}"
|
|
||||||
|
|
||||||
log "=== ISSUE #${issue_num}: ${issue_title} ==="
|
if [ "$issue_json" = "null" ] || [ -z "$issue_json" ]; then
|
||||||
|
update_active "$worker_id" "" "" "idle"
|
||||||
|
sleep 60
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
# Create worktree
|
issue_num=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['number'])")
|
||||||
if [ -d "$worktree" ]; then
|
issue_title=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['title'])")
|
||||||
log "Worktree already exists, cleaning..."
|
repo_owner=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_owner'])")
|
||||||
cleanup_worktree "$worktree" "$branch"
|
repo_name=$(echo "$issue_json" | python3 -c "import sys,json; print(json.load(sys.stdin)['repo_name'])")
|
||||||
fi
|
issue_key="${repo_owner}-${repo_name}-${issue_num}"
|
||||||
|
branch="gemini/issue-${issue_num}"
|
||||||
|
worktree="${WORKTREE_BASE}/gemini-w${worker_id}-${issue_num}"
|
||||||
|
|
||||||
cd "$REPO_DIR"
|
if ! lock_issue "$issue_key"; then
|
||||||
if ! git worktree add "$worktree" -b "$branch" origin/main 2>&1; then
|
sleep 5
|
||||||
log "ERROR: Failed to create worktree for #${issue_num}"
|
continue
|
||||||
failure_count=$((failure_count + 1))
|
fi
|
||||||
sleep "$COOLDOWN"
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Configure git remote with gemini's token so it can push
|
log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ==="
|
||||||
cd "$worktree"
|
update_active "$worker_id" "$issue_num" "${repo_owner}/${repo_name}" "working"
|
||||||
git remote set-url origin "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${REPO_OWNER}/${REPO_NAME}.git"
|
|
||||||
cd "$REPO_DIR"
|
|
||||||
|
|
||||||
# Build prompt
|
# Ensure local clone
|
||||||
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree")
|
local_repo="${WORKTREE_BASE}/gemini-base-${repo_owner}-${repo_name}"
|
||||||
|
if [ ! -d "$local_repo" ]; then
|
||||||
|
log "WORKER-${worker_id}: Cloning ${repo_owner}/${repo_name}..."
|
||||||
|
git clone --depth=1 "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git" "$local_repo" 2>&1 || {
|
||||||
|
log "WORKER-${worker_id}: ERROR cloning"
|
||||||
|
unlock_issue "$issue_key"
|
||||||
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
|
sleep "$COOLDOWN"
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cd "$local_repo"
|
||||||
|
git fetch --unshallow origin main 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
# Run Gemini Code CLI with timeout
|
cd "$local_repo"
|
||||||
log "Launching Gemini Code for #${issue_num} (timeout: ${GEMINI_TIMEOUT}s)..."
|
timeout 60 git fetch origin main 2>/dev/null || true
|
||||||
|
git checkout main 2>/dev/null || true
|
||||||
|
git reset --hard origin/main 2>/dev/null || true
|
||||||
|
|
||||||
set +e
|
[ -d "$worktree" ] && cleanup_worktree "$worktree" "$branch"
|
||||||
cd "$worktree"
|
cd "$local_repo"
|
||||||
gtimeout "$GEMINI_TIMEOUT" gemini \
|
|
||||||
--print \
|
|
||||||
--quiet \
|
|
||||||
-w "$worktree" \
|
|
||||||
-p "$prompt" \
|
|
||||||
</dev/null 2>&1 | tee "$LOG_DIR/gemini-${issue_num}.log"
|
|
||||||
exit_code=${PIPESTATUS[0]}
|
|
||||||
cd "$REPO_DIR"
|
|
||||||
set -e
|
|
||||||
|
|
||||||
if [ "$exit_code" -eq 0 ]; then
|
if ! git worktree add "$worktree" -b "$branch" origin/main 2>&1; then
|
||||||
log "SUCCESS: #${issue_num} completed — attempting auto-merge..."
|
log "WORKER-${worker_id}: ERROR creating worktree"
|
||||||
|
unlock_issue "$issue_key"
|
||||||
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
|
sleep "$COOLDOWN"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
# Find and merge the PR gemini created
|
cd "$worktree"
|
||||||
pr_num=$(curl -sf "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/pulls?state=open&head=${REPO_OWNER}:${branch}&limit=1" \
|
git remote set-url origin "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git"
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
|
||||||
|
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name")
|
||||||
|
|
||||||
|
log "WORKER-${worker_id}: Launching Gemini Code for #${issue_num}..."
|
||||||
|
|
||||||
|
set +e
|
||||||
|
cd "$worktree"
|
||||||
|
gtimeout "$GEMINI_TIMEOUT" gemini \
|
||||||
|
-p "$prompt" \
|
||||||
|
--yolo \
|
||||||
|
</dev/null >> "$LOG_DIR/gemini-${issue_num}.log" 2>&1
|
||||||
|
exit_code=$?
|
||||||
|
set -e
|
||||||
|
|
||||||
|
if [ "$exit_code" -eq 0 ]; then
|
||||||
|
log "WORKER-${worker_id}: SUCCESS #${issue_num}"
|
||||||
|
|
||||||
|
pr_num=$(curl -sf "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls?state=open&head=${repo_owner}:${branch}&limit=1" \
|
||||||
|
-H "Authorization: token ${GITEA_TOKEN}" | python3 -c "
|
||||||
import sys,json
|
import sys,json
|
||||||
prs = json.load(sys.stdin)
|
prs = json.load(sys.stdin)
|
||||||
if prs: print(prs[0]['number'])
|
if prs: print(prs[0]['number'])
|
||||||
else: print('')
|
else: print('')
|
||||||
" 2>/dev/null)
|
" 2>/dev/null)
|
||||||
|
|
||||||
if [ -n "$pr_num" ]; then
|
if [ -n "$pr_num" ]; then
|
||||||
merge_result=$(curl -sf -X POST "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/pulls/${pr_num}/merge" \
|
curl -sf -X POST "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/pulls/${pr_num}/merge" \
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-d '{"Do": "squash"}' 2>&1) || true
|
-d '{"Do": "squash"}' >/dev/null 2>&1 || true
|
||||||
log " PR #${pr_num} merge attempted"
|
curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${repo_owner}/${repo_name}/issues/${issue_num}" \
|
||||||
|
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"state": "closed"}' >/dev/null 2>&1 || true
|
||||||
|
log "WORKER-${worker_id}: PR #${pr_num} merged, issue #${issue_num} closed"
|
||||||
|
fi
|
||||||
|
|
||||||
|
consecutive_failures=0
|
||||||
|
|
||||||
|
elif [ "$exit_code" -eq 124 ]; then
|
||||||
|
log "WORKER-${worker_id}: TIMEOUT #${issue_num}"
|
||||||
|
mark_skip "$issue_num" "timeout" 1
|
||||||
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
|
|
||||||
# Close the issue (Gitea auto-close via "Fixes #N" is unreliable)
|
|
||||||
curl -sf -X PATCH "${GITEA_URL}/api/v1/repos/${REPO_OWNER}/${REPO_NAME}/issues/${issue_num}" \
|
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"state": "closed"}' >/dev/null 2>&1 || true
|
|
||||||
log " Issue #${issue_num} closed"
|
|
||||||
else
|
else
|
||||||
log " WARN: No open PR found for branch ${branch}"
|
if grep -q "rate_limit\|rate limit\|429\|overloaded\|quota" "$LOG_DIR/gemini-${issue_num}.log" 2>/dev/null; then
|
||||||
|
log "WORKER-${worker_id}: RATE LIMITED on #${issue_num}"
|
||||||
|
mark_skip "$issue_num" "rate_limit" 0.25
|
||||||
|
consecutive_failures=$((consecutive_failures + 3))
|
||||||
|
else
|
||||||
|
log "WORKER-${worker_id}: FAILED #${issue_num} (exit ${exit_code})"
|
||||||
|
mark_skip "$issue_num" "exit_code_${exit_code}" 1
|
||||||
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
failure_count=0
|
cleanup_worktree "$worktree" "$branch"
|
||||||
issues_completed=$((issues_completed + 1))
|
unlock_issue "$issue_key"
|
||||||
log "Stats: ${issues_completed} issues completed this session"
|
update_active "$worker_id" "" "" "done"
|
||||||
elif [ "$exit_code" -eq 124 ]; then
|
|
||||||
log "TIMEOUT: #${issue_num} exceeded ${GEMINI_TIMEOUT}s"
|
|
||||||
mark_skip "$issue_num" "timeout" 1
|
|
||||||
failure_count=$((failure_count + 1))
|
|
||||||
else
|
|
||||||
log "FAILED: #${issue_num} exited with code ${exit_code}"
|
|
||||||
mark_skip "$issue_num" "exit_code_${exit_code}" 1
|
|
||||||
failure_count=$((failure_count + 1))
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clean up worktree
|
sleep "$COOLDOWN"
|
||||||
cleanup_worktree "$worktree" "$branch"
|
done
|
||||||
|
}
|
||||||
|
|
||||||
# Cooldown
|
# === MAIN ===
|
||||||
log "Cooling down ${COOLDOWN}s before next issue..."
|
log "=== Gemini Loop Started — ${NUM_WORKERS} workers (max ${MAX_WORKERS}) ==="
|
||||||
sleep "$COOLDOWN"
|
log "Worktrees: ${WORKTREE_BASE}"
|
||||||
|
|
||||||
|
rm -rf "$LOCK_DIR"/*.lock 2>/dev/null
|
||||||
|
|
||||||
|
# PID tracking via files (bash 3.2 compatible)
|
||||||
|
PID_DIR="$LOG_DIR/gemini-pids"
|
||||||
|
mkdir -p "$PID_DIR"
|
||||||
|
rm -f "$PID_DIR"/*.pid 2>/dev/null
|
||||||
|
|
||||||
|
launch_worker() {
|
||||||
|
local wid="$1"
|
||||||
|
run_worker "$wid" &
|
||||||
|
echo $! > "$PID_DIR/${wid}.pid"
|
||||||
|
log "Launched worker $wid (PID $!)"
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in $(seq 1 "$NUM_WORKERS"); do
|
||||||
|
launch_worker "$i"
|
||||||
|
sleep 3
|
||||||
|
done
|
||||||
|
|
||||||
|
# Dynamic scaler — every 3 minutes
|
||||||
|
CURRENT_WORKERS="$NUM_WORKERS"
|
||||||
|
while true; do
|
||||||
|
sleep 180
|
||||||
|
|
||||||
|
# Reap dead workers
|
||||||
|
for pidfile in "$PID_DIR"/*.pid; do
|
||||||
|
[ -f "$pidfile" ] || continue
|
||||||
|
wid=$(basename "$pidfile" .pid)
|
||||||
|
wpid=$(cat "$pidfile")
|
||||||
|
if ! kill -0 "$wpid" 2>/dev/null; then
|
||||||
|
log "SCALER: Worker $wid died — relaunching"
|
||||||
|
launch_worker "$wid"
|
||||||
|
sleep 2
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
recent_rate_limits=$(tail -100 "$LOG_DIR/gemini-loop.log" 2>/dev/null | grep -c "RATE LIMITED" || true)
|
||||||
|
recent_successes=$(tail -100 "$LOG_DIR/gemini-loop.log" 2>/dev/null | grep -c "SUCCESS" || true)
|
||||||
|
|
||||||
|
if [ "$recent_rate_limits" -gt 0 ]; then
|
||||||
|
if [ "$CURRENT_WORKERS" -gt 2 ]; then
|
||||||
|
drop_to=$(( CURRENT_WORKERS / 2 ))
|
||||||
|
[ "$drop_to" -lt 2 ] && drop_to=2
|
||||||
|
log "SCALER: Rate limited — scaling ${CURRENT_WORKERS} → ${drop_to}"
|
||||||
|
for wid in $(seq $((drop_to + 1)) "$CURRENT_WORKERS"); do
|
||||||
|
if [ -f "$PID_DIR/${wid}.pid" ]; then
|
||||||
|
kill "$(cat "$PID_DIR/${wid}.pid")" 2>/dev/null || true
|
||||||
|
rm -f "$PID_DIR/${wid}.pid"
|
||||||
|
update_active "$wid" "" "" "done"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
CURRENT_WORKERS=$drop_to
|
||||||
|
fi
|
||||||
|
elif [ "$recent_successes" -ge 2 ] && [ "$CURRENT_WORKERS" -lt "$MAX_WORKERS" ]; then
|
||||||
|
new_count=$(( CURRENT_WORKERS + 2 ))
|
||||||
|
[ "$new_count" -gt "$MAX_WORKERS" ] && new_count=$MAX_WORKERS
|
||||||
|
log "SCALER: Healthy — scaling ${CURRENT_WORKERS} → ${new_count}"
|
||||||
|
for wid in $(seq $((CURRENT_WORKERS + 1)) "$new_count"); do
|
||||||
|
launch_worker "$wid"
|
||||||
|
sleep 2
|
||||||
|
done
|
||||||
|
CURRENT_WORKERS=$new_count
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|||||||
251
bin/loop-watchdog.sh
Normal file
251
bin/loop-watchdog.sh
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
#!/usr/bin/env bash
# loop-watchdog.sh — Self-healing monitor for all agent loops
# Runs every 2 minutes. Restarts dead loops, kills zombies,
# and files Gitea issues for problems it can't auto-fix.
#
# Usage: Run via cron or: nohup bash ~/.hermes/bin/loop-watchdog.sh &

# NOTE(review): -e is deliberately absent here (unlike the loop scripts),
# presumably so a failing health probe doesn't kill the watchdog — confirm.
set -uo pipefail

# All watchdog state lives under ~/.hermes/logs.
LOG_DIR="$HOME/.hermes/logs"
LOG="$LOG_DIR/watchdog.log"
ISSUE_LOG="$LOG_DIR/watchdog-issues.json" # tracks filed issues to avoid duplicates
GITEA_URL="http://143.198.27.163:3000"
# May be empty if the token file is missing (cat errors are suppressed);
# issue filing will then fail and be logged as a WARN downstream.
ADMIN_TOKEN=$(cat "$HOME/.config/gitea/token" 2>/dev/null)
ISSUE_REPO="rockachopa/hermes-agent" # ops issues go here
CHECK_INTERVAL=120 # 2 minutes

mkdir -p "$LOG_DIR"
# Seed the dedup store with an empty JSON object on first run.
[ -f "$ISSUE_LOG" ] || echo '{}' > "$ISSUE_LOG"
||||||
|
log() {
  # Append one timestamped watchdog line to the shared log file.
  printf '[%s] WATCHDOG: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG"
}
|
||||||
|
|
||||||
|
# File a Gitea issue for problems that can't be auto-fixed.
# Deduplicates: won't file the same issue_key within 6 hours.
#
# Arguments:
#   $1 - issue_key: stable dedup key (e.g. "gitea-down")
#   $2 - title:     issue title (prefixed with "[watchdog]" on Gitea)
#   $3 - body:      issue body; "\n" sequences become newlines via the
#                   python triple-quoted string below
#   $4 - assignee:  Gitea username, defaults to "claude"
#
# NOTE(review): $title/$body/$issue_key are interpolated directly into
# python source — quotes or triple-quotes in those values would break or
# inject into the snippet. Inputs are watchdog-generated, but confirm.
file_issue() {
  local issue_key="$1"
  local title="$2"
  local body="$3"
  local assignee="${4:-claude}"

  # Check if we already filed this recently.
  # Side effect: when filing proceeds, the key is recorded in ISSUE_LOG
  # with a 6-hour (21600 s) suppression window.
  local should_file
  should_file=$(python3 -c "
import json, time
try:
    with open('$ISSUE_LOG') as f: filed = json.load(f)
except: filed = {}
entry = filed.get('$issue_key', {})
if entry and entry.get('until', 0) > time.time():
    print('no')
else:
    filed['$issue_key'] = {'until': time.time() + 21600, 'title': '''$title'''}
    with open('$ISSUE_LOG', 'w') as f: json.dump(filed, f, indent=2)
    print('yes')
" 2>/dev/null)

  # Anything other than an explicit "yes" (including python failing and
  # printing nothing) suppresses filing.
  if [ "$should_file" != "yes" ]; then
    return 0
  fi

  log "FILING ISSUE: $title"
  # Build the JSON payload with python (handles quoting/newlines), then
  # POST it; on any curl failure just log a WARN — never crash the loop.
  curl -sf -X POST "${GITEA_URL}/api/v1/repos/${ISSUE_REPO}/issues" \
    -H "Authorization: token ${ADMIN_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$(python3 -c "
import json
print(json.dumps({
    'title': '[watchdog] $title',
    'body': '''$body

---
*Auto-filed by loop-watchdog at $(date '+%Y-%m-%d %H:%M:%S')*''',
    'assignees': ['$assignee'],
}))" 2>/dev/null)" >/dev/null 2>&1 || log "WARN: Failed to file issue: $title"
}
|
||||||
|
|
||||||
|
# === HEALTH CHECKS ===

# Check one agent loop: restart it if dead, kick it if stalled, and file
# a Gitea issue when a restart fails or the loop is crash-looping.
#
# Arguments:
#   $1 - name:       agent label used in logs/issue keys
#   $2 - grep_pat:   pgrep -f pattern that identifies the loop process
#   $3 - wake_cmd:   shell command (eval'd) that restarts the loop
#   $4 - log_file:   the loop's log, used for stall/crash detection
#   $5 - worker_pat: optional pgrep -f pattern for its worker processes
check_loop() {
  local name="$1" # kimi | claude | gemini
  local grep_pat="$2" # pattern to find the loop process
  local wake_cmd="$3" # command to restart
  local log_file="$4" # log to check for errors
  local worker_pat="${5:-}" # optional: pattern for worker processes

  local pid
  pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)

  if [ -z "$pid" ]; then
    log "$name loop DOWN — restarting..."
    # wake_cmd is a trusted, hardcoded string from main below — eval is
    # needed because it contains redirections and a trailing '&'.
    eval "$wake_cmd"
    sleep 3

    # Verify it came back
    pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
    if [ -z "$pid" ]; then
      file_issue \
        "${name}-loop-dead" \
        "${name} loop won't start" \
        "The ${name} agent loop failed to start after automatic restart attempt.\n\nRestart command: \`${wake_cmd}\`\n\nLast 20 lines of log:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`\n\nPlease investigate and fix the startup issue." \
        "claude"
    else
      log "$name loop restarted (PID $pid)"
    fi
    return
  fi

  # Loop is running — check for stalls
  if [ -f "$log_file" ]; then
    local last_activity
    # BSD stat (-f %m) first, GNU stat (-c %Y) fallback, else 0 — this
    # script must run under bash 3.2 on macOS as well as Linux.
    last_activity=$(stat -f %m "$log_file" 2>/dev/null || stat -c %Y "$log_file" 2>/dev/null || echo 0)
    local now
    now=$(date +%s)
    local stale_seconds=$(( now - last_activity ))

    # If no log activity for 30 minutes, something is wrong
    if [ "$stale_seconds" -gt 1800 ]; then
      log "$name loop STALE — no activity for ${stale_seconds}s"

      # Check if it's just idle (empty queue) vs truly stuck
      local last_line
      last_line=$(tail -1 "$log_file" 2>/dev/null)
      if echo "$last_line" | grep -q "Queue empty\|Waiting for assignments\|idle"; then
        # Just idle, that's fine
        return
      fi

      # Kill and restart — take the workers down too so the relaunched
      # loop doesn't collide with orphans.
      log "$name loop stuck — killing and restarting..."
      pkill -f "$grep_pat" 2>/dev/null
      [ -n "$worker_pat" ] && pkill -f "$worker_pat" 2>/dev/null
      sleep 2
      eval "$wake_cmd"
      sleep 3

      pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
      if [ -z "$pid" ]; then
        file_issue \
          "${name}-loop-stuck" \
          "${name} loop stuck and won't restart" \
          "The ${name} loop was stale (${stale_seconds}s no activity) and failed to restart.\n\nLast 20 lines:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`" \
          "claude"
      else
        log "$name loop recovered (PID $pid)"
      fi
    fi

    # Check for crash loops (5+ failures in last 50 lines)
    local recent_failures
    recent_failures=$(tail -50 "$log_file" 2>/dev/null | grep -c "FAILED:\|ERROR:" || true)
    if [ "$recent_failures" -ge 5 ]; then
      local error_sample
      error_sample=$(tail -50 "$log_file" 2>/dev/null | grep "FAILED:\|ERROR:" | tail -5)
      # file_issue dedups on the key, so this fires at most once per 6 h.
      file_issue \
        "${name}-crash-loop" \
        "${name} agent in crash loop (${recent_failures} recent failures)" \
        "The ${name} agent loop has ${recent_failures} failures in its last 50 log lines, suggesting a systemic problem.\n\nRecent errors:\n\`\`\`\n${error_sample}\n\`\`\`\n\nPlease investigate the root cause — could be API issues, bad repo state, or CLI problems." \
        "claude"
    fi
  fi
}
|
||||||
|
|
||||||
|
check_gitea() {
  # Probe the Gitea API with a short timeout; every agent loop depends
  # on this instance, so an outage is filed immediately.
  if curl -sf --max-time 5 "${GITEA_URL}/api/v1/version" >/dev/null 2>&1; then
    return 0
  fi

  log "Gitea UNREACHABLE"
  file_issue \
    "gitea-down" \
    "Gitea instance unreachable" \
    "The Gitea instance at ${GITEA_URL} is not responding. All agent loops depend on it.\n\nThis needs immediate attention — check the VPS at 143.198.27.163." \
    "claude"
}
|
||||||
|
|
||||||
|
# Kill runaway git/pytest processes left behind by worker loops.
# Thresholds are deliberately loose (>3) so normal concurrent activity
# is never touched.
check_zombies() {
  local stuck_git
  stuck_git=$(ps aux | grep -E "git.*push|git-remote-http" | grep -v grep | wc -l | tr -d ' ')
  local orphan_py
  orphan_py=$(ps aux | grep "pytest tests/" | grep -v grep | wc -l | tr -d ' ')

  if [ "$stuck_git" -gt 3 ]; then
    log "Killing $stuck_git stuck git processes"
    # pkill -f takes an ERE, so alternation is a plain "|". The previous
    # BRE-style "git.*push\|git-remote-http" never matched under ERE,
    # so stuck pushes were counted but never actually killed.
    pkill -f "git.*push|git-remote-http" 2>/dev/null || true
  fi

  if [ "$orphan_py" -gt 3 ]; then
    log "Killing $orphan_py orphaned pytest processes"
    pkill -f "pytest tests/" 2>/dev/null || true
  fi
}
|
||||||
|
|
||||||
|
check_disk() {
  # Worktree dirs should be cleaned up by the loops themselves; a large
  # pile-up means some loop's cleanup path is broken.
  local worktree_count
  worktree_count=$(find "$HOME/worktrees" -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')

  # Up to 30 entries (including the base dir itself) is considered normal.
  [ "$worktree_count" -le 30 ] && return 0

  log "WARN: $worktree_count worktrees — possible leak"
  file_issue \
    "worktree-leak" \
    "Worktree accumulation: ${worktree_count} dirs in ~/worktrees" \
    "There are ${worktree_count} worktree directories, suggesting cleanup is failing.\n\nList:\n\`\`\`\n$(ls -1 "$HOME/worktrees/" 2>/dev/null | head -30)\n\`\`\`\n\nPlease investigate which loops are leaking worktrees and fix the cleanup." \
    "claude"
}
|
||||||
|
|
||||||
|
# Detect systemic blockage: if an agent's skip list has many active
# entries, most of its queue is blocked and a human should look.
check_skip_lists() {
  # If all agents have full skip lists, the whole system is stuck
  for agent in claude gemini kimi; do
    local skip_file="$LOG_DIR/${agent}-skip-list.json"
    [ -f "$skip_file" ] || continue
    # Count only entries whose suppression window ("until") is still in
    # the future; expired skips don't indicate a current problem.
    local skip_count
    skip_count=$(python3 -c "
import json, time
try:
    with open('$skip_file') as f: skips = json.load(f)
    active = sum(1 for v in skips.values() if v.get('until',0) > time.time())
    print(active)
except: print(0)
" 2>/dev/null)

    # ${skip_count:-0} guards against python printing nothing at all.
    if [ "${skip_count:-0}" -gt 10 ]; then
      file_issue \
        "${agent}-skip-overload" \
        "${agent} has ${skip_count} skipped issues — systemic failure" \
        "The ${agent} agent has ${skip_count} issues in its skip list, meaning most of its queue is blocked.\n\nSkip list contents:\n\`\`\`\n$(cat "$skip_file" 2>/dev/null | python3 -m json.tool 2>/dev/null | head -40)\n\`\`\`\n\nThis likely indicates a systemic problem (broken tests, bad config, API changes)." \
        "claude"
    fi
  done
}
|
||||||
|
|
||||||
|
# === MAIN ===
# Runs forever; each pass performs all health checks, then sleeps
# CHECK_INTERVAL (120 s). Intended to be the only foreground activity.
log "=== Watchdog Started ==="

while true; do
  # Gitea must be up for anything to work
  check_gitea

  # Check each agent loop
  # args: name, pgrep pattern, restart command, log file, worker pattern
  check_loop "kimi" "kimi-loop.sh" \
    "nohup bash ~/.hermes/bin/kimi-loop.sh >> ~/.hermes/logs/kimi-loop.log 2>&1 &" \
    "$LOG_DIR/kimi-loop.log" \
    "kimi.*--print"

  # NOTE(review): restarts the claude loop with 3 workers, while
  # claude-loop.sh's own default is 7 — confirm this is intentional.
  check_loop "claude" "claude-loop.sh" \
    "nohup bash ~/.hermes/bin/claude-loop.sh 3 >> ~/.hermes/logs/claude-loop.log 2>&1 &" \
    "$LOG_DIR/claude-loop.log" \
    "claude.*--print.*--dangerously"

  check_loop "gemini" "gemini-loop.sh" \
    "nohup bash ~/.hermes/bin/gemini-loop.sh >> ~/.hermes/logs/gemini-loop.log 2>&1 &" \
    "$LOG_DIR/gemini-loop.log" \
    "gemini.*-p"

  # Housekeeping
  check_zombies
  check_disk
  check_skip_lists

  sleep "$CHECK_INTERVAL"
done
|
||||||
@@ -35,6 +35,10 @@ ops-help() {
|
|||||||
echo " ops-kill-gemini Stop Gemini loop"
|
echo " ops-kill-gemini Stop Gemini loop"
|
||||||
echo " ops-kill-zombies Kill stuck git/pytest"
|
echo " ops-kill-zombies Kill stuck git/pytest"
|
||||||
echo ""
|
echo ""
|
||||||
|
echo -e " \033[1mWatchdog\033[0m"
|
||||||
|
echo " ops-wake-watchdog Start loop watchdog"
|
||||||
|
echo " ops-kill-watchdog Stop loop watchdog"
|
||||||
|
echo ""
|
||||||
echo -e " \033[2m Type ops-help to see this again\033[0m"
|
echo -e " \033[2m Type ops-help to see this again\033[0m"
|
||||||
echo ""
|
echo ""
|
||||||
}
|
}
|
||||||
@@ -196,3 +200,15 @@ ops-kill-zombies() {
|
|||||||
done
|
done
|
||||||
echo " Killed $killed zombie processes"
|
echo " Killed $killed zombie processes"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ops-wake-watchdog() {
  # Restart the watchdog: drop any running instance first, then relaunch
  # detached with its output appended to the watchdog log.
  pkill -f "loop-watchdog.sh" 2>/dev/null
  sleep 1
  nohup bash ~/.hermes/bin/loop-watchdog.sh >> ~/.hermes/logs/watchdog.log 2>&1 &
  local wd_pid=$!
  echo " Watchdog started (PID $wd_pid)"
}
|
||||||
|
|
||||||
|
ops-kill-watchdog() {
  # Stop the watchdog loop if one is running; quiet if it isn't.
  pkill -f "loop-watchdog.sh" 2>/dev/null || true
  echo " Watchdog stopped"
}
||||||
|
|||||||
Reference in New Issue
Block a user