fix: replace worktrees with fresh clones, fix watchdog issue filing

- Workers now run `git clone --depth=1` per issue instead of sharing base repos with worktrees. This eliminates all contention and branch collision errors.
- Watchdog `file_issue` uses a temp file + `sys.argv` for safe JSON escaping.
- Watchdog zombie detection only kills processes >5min old (it was killing legitimate git pushes from active workers).
- Simplified cleanup to a plain `rm -rf` (no worktree bookkeeping needed).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -91,19 +91,9 @@ with open('$ACTIVE_FILE', 'r+') as f:
|
|||||||
" 2>/dev/null
|
" 2>/dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup_worktree() {
|
cleanup_workdir() {
|
||||||
local wt="$1"
|
local wt="$1"
|
||||||
local branch="$2"
|
rm -rf "$wt" 2>/dev/null || true
|
||||||
if [ -d "$wt" ]; then
|
|
||||||
local parent
|
|
||||||
parent=$(git -C "$wt" rev-parse --git-common-dir 2>/dev/null | sed 's|/.git$||' || true)
|
|
||||||
if [ -n "$parent" ] && [ -d "$parent" ]; then
|
|
||||||
cd "$parent"
|
|
||||||
fi
|
|
||||||
git worktree remove --force "$wt" 2>/dev/null || rm -rf "$wt"
|
|
||||||
git worktree prune 2>/dev/null
|
|
||||||
git branch -D "$branch" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
get_next_issue() {
|
get_next_issue() {
|
||||||
@@ -303,41 +293,19 @@ run_worker() {
|
|||||||
log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ==="
|
log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ==="
|
||||||
update_active "$worker_id" "$issue_num" "${repo_owner}/${repo_name}" "working"
|
update_active "$worker_id" "$issue_num" "${repo_owner}/${repo_name}" "working"
|
||||||
|
|
||||||
# Ensure local clone
|
# Fresh clone per issue — no shared state, no contention
|
||||||
local_repo="${WORKTREE_BASE}/claude-base-${repo_owner}-${repo_name}"
|
rm -rf "$worktree" 2>/dev/null
|
||||||
if [ ! -d "$local_repo" ]; then
|
if ! git clone --depth=1 -b main \
|
||||||
log "WORKER-${worker_id}: Cloning ${repo_owner}/${repo_name}..."
|
"http://claude:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git" \
|
||||||
git clone --depth=1 "http://claude:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git" "$local_repo" 2>&1 || {
|
"$worktree" >/dev/null 2>&1; then
|
||||||
log "WORKER-${worker_id}: ERROR cloning"
|
log "WORKER-${worker_id}: ERROR cloning for #${issue_num}"
|
||||||
unlock_issue "$issue_key"
|
|
||||||
consecutive_failures=$((consecutive_failures + 1))
|
|
||||||
sleep "$COOLDOWN"
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
cd "$local_repo"
|
|
||||||
git fetch --unshallow origin main 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Fetch latest
|
|
||||||
cd "$local_repo"
|
|
||||||
timeout 60 git fetch origin main 2>/dev/null || true
|
|
||||||
git checkout main 2>/dev/null || true
|
|
||||||
git reset --hard origin/main 2>/dev/null || true
|
|
||||||
|
|
||||||
# Create worktree
|
|
||||||
[ -d "$worktree" ] && cleanup_worktree "$worktree" "$branch"
|
|
||||||
cd "$local_repo"
|
|
||||||
|
|
||||||
if ! git worktree add "$worktree" -b "$branch" origin/main 2>&1; then
|
|
||||||
log "WORKER-${worker_id}: ERROR creating worktree"
|
|
||||||
unlock_issue "$issue_key"
|
unlock_issue "$issue_key"
|
||||||
consecutive_failures=$((consecutive_failures + 1))
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
sleep "$COOLDOWN"
|
sleep "$COOLDOWN"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cd "$worktree"
|
cd "$worktree"
|
||||||
git remote set-url origin "http://claude:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git"
|
git checkout -b "$branch" >/dev/null 2>&1
|
||||||
|
|
||||||
# Build prompt and run
|
# Build prompt and run
|
||||||
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name")
|
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name")
|
||||||
@@ -399,7 +367,7 @@ else: print('')
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
cleanup_worktree "$worktree" "$branch"
|
cleanup_workdir "$worktree"
|
||||||
unlock_issue "$issue_key"
|
unlock_issue "$issue_key"
|
||||||
update_active "$worker_id" "" "" "done"
|
update_active "$worker_id" "" "" "done"
|
||||||
|
|
||||||
|
|||||||
@@ -85,16 +85,9 @@ with open('$ACTIVE_FILE', 'r+') as f:
|
|||||||
" 2>/dev/null
|
" 2>/dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup_worktree() {
|
cleanup_workdir() {
|
||||||
local wt="$1" branch="$2"
|
local wt="$1"
|
||||||
if [ -d "$wt" ]; then
|
rm -rf "$wt" 2>/dev/null || true
|
||||||
local parent
|
|
||||||
parent=$(git -C "$wt" rev-parse --git-common-dir 2>/dev/null | sed 's|/.git$||' || true)
|
|
||||||
[ -n "$parent" ] && [ -d "$parent" ] && cd "$parent"
|
|
||||||
git worktree remove --force "$wt" 2>/dev/null || rm -rf "$wt"
|
|
||||||
git worktree prune 2>/dev/null
|
|
||||||
git branch -D "$branch" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
get_next_issue() {
|
get_next_issue() {
|
||||||
@@ -283,39 +276,19 @@ run_worker() {
|
|||||||
log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ==="
|
log "WORKER-${worker_id}: === ISSUE #${issue_num}: ${issue_title} (${repo_owner}/${repo_name}) ==="
|
||||||
update_active "$worker_id" "$issue_num" "${repo_owner}/${repo_name}" "working"
|
update_active "$worker_id" "$issue_num" "${repo_owner}/${repo_name}" "working"
|
||||||
|
|
||||||
# Ensure local clone
|
# Fresh clone per issue — no shared state, no contention
|
||||||
local_repo="${WORKTREE_BASE}/gemini-base-${repo_owner}-${repo_name}"
|
rm -rf "$worktree" 2>/dev/null
|
||||||
if [ ! -d "$local_repo" ]; then
|
if ! git clone --depth=1 -b main \
|
||||||
log "WORKER-${worker_id}: Cloning ${repo_owner}/${repo_name}..."
|
"http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git" \
|
||||||
git clone --depth=1 "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git" "$local_repo" 2>&1 || {
|
"$worktree" >/dev/null 2>&1; then
|
||||||
log "WORKER-${worker_id}: ERROR cloning"
|
log "WORKER-${worker_id}: ERROR cloning for #${issue_num}"
|
||||||
unlock_issue "$issue_key"
|
|
||||||
consecutive_failures=$((consecutive_failures + 1))
|
|
||||||
sleep "$COOLDOWN"
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
cd "$local_repo"
|
|
||||||
git fetch --unshallow origin main 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
cd "$local_repo"
|
|
||||||
timeout 60 git fetch origin main 2>/dev/null || true
|
|
||||||
git checkout main 2>/dev/null || true
|
|
||||||
git reset --hard origin/main 2>/dev/null || true
|
|
||||||
|
|
||||||
[ -d "$worktree" ] && cleanup_worktree "$worktree" "$branch"
|
|
||||||
cd "$local_repo"
|
|
||||||
|
|
||||||
if ! git worktree add "$worktree" -b "$branch" origin/main 2>&1; then
|
|
||||||
log "WORKER-${worker_id}: ERROR creating worktree"
|
|
||||||
unlock_issue "$issue_key"
|
unlock_issue "$issue_key"
|
||||||
consecutive_failures=$((consecutive_failures + 1))
|
consecutive_failures=$((consecutive_failures + 1))
|
||||||
sleep "$COOLDOWN"
|
sleep "$COOLDOWN"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cd "$worktree"
|
cd "$worktree"
|
||||||
git remote set-url origin "http://gemini:${GITEA_TOKEN}@143.198.27.163:3000/${repo_owner}/${repo_name}.git"
|
git checkout -b "$branch" >/dev/null 2>&1
|
||||||
|
|
||||||
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name")
|
prompt=$(build_prompt "$issue_num" "$issue_title" "$worktree" "$repo_owner" "$repo_name")
|
||||||
|
|
||||||
@@ -372,7 +345,7 @@ else: print('')
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cleanup_worktree "$worktree" "$branch"
|
cleanup_workdir "$worktree"
|
||||||
unlock_issue "$issue_key"
|
unlock_issue "$issue_key"
|
||||||
update_active "$worker_id" "" "" "done"
|
update_active "$worker_id" "" "" "done"
|
||||||
|
|
||||||
|
|||||||
@@ -51,19 +51,27 @@ else:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
log "FILING ISSUE: $title"
|
log "FILING ISSUE: $title"
|
||||||
curl -sf -X POST "${GITEA_URL}/api/v1/repos/${ISSUE_REPO}/issues" \
|
local tmpfile="/tmp/watchdog-issue-$$.json"
|
||||||
-H "Authorization: token ${ADMIN_TOKEN}" \
|
python3 -c "
|
||||||
-H "Content-Type: application/json" \
|
import json, sys
|
||||||
-d "$(python3 -c "
|
title = sys.argv[1]
|
||||||
import json
|
body = sys.argv[2]
|
||||||
print(json.dumps({
|
assignee = sys.argv[3]
|
||||||
'title': '[watchdog] $title',
|
with open('$tmpfile', 'w') as f:
|
||||||
'body': '''$body
|
json.dump({
|
||||||
|
'title': '[watchdog] ' + title,
|
||||||
|
'body': body + '\n\n---\n*Auto-filed by loop-watchdog*',
|
||||||
|
'assignees': [assignee],
|
||||||
|
}, f)
|
||||||
|
" "$title" "$body" "$assignee" 2>/dev/null
|
||||||
|
|
||||||
---
|
if [ -f "$tmpfile" ]; then
|
||||||
*Auto-filed by loop-watchdog at $(date '+%Y-%m-%d %H:%M:%S')*''',
|
curl -sf -X POST "${GITEA_URL}/api/v1/repos/${ISSUE_REPO}/issues" \
|
||||||
'assignees': ['$assignee'],
|
-H "Authorization: token ${ADMIN_TOKEN}" \
|
||||||
}))" 2>/dev/null)" >/dev/null 2>&1 || log "WARN: Failed to file issue: $title"
|
-H "Content-Type: application/json" \
|
||||||
|
-d @"$tmpfile" >/dev/null 2>&1 || log "WARN: Failed to file issue: $title"
|
||||||
|
rm -f "$tmpfile"
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# === HEALTH CHECKS ===
|
# === HEALTH CHECKS ===
|
||||||
@@ -164,20 +172,31 @@ check_gitea() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
check_zombies() {
|
check_zombies() {
|
||||||
local stuck_git
|
# Only kill git/pytest processes older than 5 minutes (300 seconds)
|
||||||
stuck_git=$(ps aux | grep "git.*push\|git-remote-http" | grep -v grep | wc -l | tr -d ' ')
|
# Normal pushes from workers should complete in under a minute
|
||||||
local orphan_py
|
local killed=0
|
||||||
orphan_py=$(ps aux | grep "pytest tests/" | grep -v grep | wc -l | tr -d ' ')
|
for pid in $(ps -eo pid,etime,command | grep -E "git.*push|git-remote-http" | grep -v grep | awk '{
|
||||||
|
split($2, t, /[:-]/);
|
||||||
|
if (length(t)==3) secs=t[1]*3600+t[2]*60+t[3];
|
||||||
|
else if (length(t)==2) secs=t[1]*60+t[2];
|
||||||
|
else secs=t[1];
|
||||||
|
if (secs > 300) print $1
|
||||||
|
}'); do
|
||||||
|
kill "$pid" 2>/dev/null && killed=$((killed + 1))
|
||||||
|
done
|
||||||
|
[ "$killed" -gt 0 ] && log "Killed $killed stuck git processes (>5min old)"
|
||||||
|
|
||||||
if [ "$stuck_git" -gt 3 ]; then
|
local killed_py=0
|
||||||
log "Killing $stuck_git stuck git processes"
|
for pid in $(ps -eo pid,etime,command | grep "pytest tests/" | grep -v grep | awk '{
|
||||||
pkill -f "git.*push\|git-remote-http" 2>/dev/null || true
|
split($2, t, /[:-]/);
|
||||||
fi
|
if (length(t)==3) secs=t[1]*3600+t[2]*60+t[3];
|
||||||
|
else if (length(t)==2) secs=t[1]*60+t[2];
|
||||||
if [ "$orphan_py" -gt 3 ]; then
|
else secs=t[1];
|
||||||
log "Killing $orphan_py orphaned pytest processes"
|
if (secs > 300) print $1
|
||||||
pkill -f "pytest tests/" 2>/dev/null || true
|
}'); do
|
||||||
fi
|
kill "$pid" 2>/dev/null && killed_py=$((killed_py + 1))
|
||||||
|
done
|
||||||
|
[ "$killed_py" -gt 0 ] && log "Killed $killed_py orphaned pytest processes (>5min old)"
|
||||||
}
|
}
|
||||||
|
|
||||||
check_disk() {
|
check_disk() {
|
||||||
|
|||||||
Reference in New Issue
Block a user