localgrep_pat="$2"# pattern to find the loop process
localwake_cmd="$3"# command to restart
locallog_file="$4"# log to check for errors
localworker_pat="${5:-}"# optional: pattern for worker processes
local pid
pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
if[ -z "$pid"];then
log "$name loop DOWN — restarting..."
eval"$wake_cmd"
sleep 3
# Verify it came back
pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
if[ -z "$pid"];then
file_issue \
"${name}-loop-dead"\
"${name} loop won't start"\
"The ${name} agent loop failed to start after automatic restart attempt.\n\nRestart command: \`${wake_cmd}\`\n\nLast 20 lines of log:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`\n\nPlease investigate and fix the startup issue."\
"The ${name} loop was stale (${stale_seconds}s no activity) and failed to restart.\n\nLast 20 lines:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`"\
"claude"
else
log "$name loop recovered (PID $pid)"
fi
fi
# Check for crash loops (5+ failures in last 50 lines)
"${name} agent in crash loop (${recent_failures} recent failures)"\
"The ${name} agent loop has ${recent_failures} failures in its last 50 log lines, suggesting a systemic problem.\n\nRecent errors:\n\`\`\`\n${error_sample}\n\`\`\`\n\nPlease investigate the root cause — could be API issues, bad repo state, or CLI problems."\
"claude"
fi
fi
}
# check_gitea -- probe the Gitea API and raise an issue when it does not answer.
#
# Globals read: GITEA_URL, HOME, worktree_count (none of them is assigned here).
# Helpers used: log, file_issue (both defined outside this function).
#
# NOTE(review): the body appears to contain the tail of a second, worktree-leak
# check. The first file_issue call never reaches a final argument or a closing
# fi of its own, and worktree_count is used without being set in this function.
# Lines may be missing between the two calls -- TODO confirm against the
# upstream script before relying on this block.
check_gitea(){
# curl flags: -s hides progress output, -f makes HTTP error statuses exit
# non-zero, --max-time caps the whole request. All output is discarded, so only
# the exit status drives the branch.
# NOTE(review): there is no space between the 5 and the quoted URL, so both
# expand into a single word handed to --max-time and curl gets no URL operand.
# Presumably a space is intended -- TODO confirm.
if ! curl -sf --max-time 5"${GITEA_URL}/api/v1/version" >/dev/null 2>&1;then
log "Gitea UNREACHABLE"
# Report the outage; the arguments are written one per line and joined with
# line continuations.
# NOTE(review): each continuation backslash sits directly after the closing
# quote and the next line starts at column 0, so the shell glues consecutive
# strings into one word instead of passing separate arguments. The last string
# also ends in a continuation, which pulls the following log line into this
# same command. TODO confirm the intended argument list.
file_issue \
"gitea-down"\
"Gitea instance unreachable"\
"The Gitea instance at ${GITEA_URL} is not responding. All agent loops depend on it.\n\nThis needs immediate attention — check the VPS at 143.198.27.163."\
log "WARN: $worktree_count worktrees — possible leak"
# Report the worktree count together with at most the first 30 entries that ls
# prints for the worktrees directory under HOME (ls errors are suppressed).
# NOTE(review): same glued-argument concern as the call above, and this call
# only runs inside the curl-failure branch as the code stands -- TODO confirm
# that this is not the remainder of a separate check.
file_issue \
"worktree-leak"\
"Worktree accumulation: ${worktree_count} dirs in ~/worktrees"\
"There are ${worktree_count} worktree directories, suggesting cleanup is failing.\n\nList:\n\`\`\`\n$(ls -1 "$HOME/worktrees/" 2>/dev/null | head -30)\n\`\`\`\n\nPlease investigate which loops are leaking worktrees and fix the cleanup."\
"claude"
fi
}
check_skip_lists(){
# If all agents have full skip lists, the whole system is stuck
for agent in claude gemini kimi;do
localskip_file="$LOG_DIR/${agent}-skip-list.json"
[ -f "$skip_file"]||continue
local skip_count
skip_count=$(python3 -c "
import json, time
try:
with open('$skip_file') as f: skips= json.load(f)
active= sum(1for v in skips.values()if v.get('until',0) > time.time())
print(active)
except: print(0)
" 2>/dev/null)
if["${skip_count:-0}" -gt 10];then
file_issue \
"${agent}-skip-overload"\
"${agent} has ${skip_count} skipped issues — systemic failure"\
"The ${agent} agent has ${skip_count} issues in its skip list, meaning most of its queue is blocked.\n\nSkip list contents:\n\`\`\`\n$(cat "$skip_file" 2>/dev/null | python3 -m json.tool 2>/dev/null | head -40)\n\`\`\`\n\nThis likely indicates a systemic problem (broken tests, bad config, API changes)."\