From 64a8ffa329f8c8fc45703da5eefe8940e246ee4a Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Sun, 22 Mar 2026 22:03:58 -0400 Subject: [PATCH] fix: watchdog single-instance guard, stop killing worker processes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pidfile guard prevents duplicate watchdog instances - Removed check_zombies — was killing legitimate git push/clone from active workers (caused mass exit 143 failures) - Replaced with check_disk that cleans clone dirs >1hr old - Disabled gemini loop check (no API key configured) - Workers use fresh clone per issue (no shared worktree contention) - Simplified cleanup to rm -rf (no git worktree bookkeeping) - Tested file_issue end-to-end — confirmed working Co-Authored-By: Claude Opus 4.6 --- bin/loop-watchdog.sh | 226 +++++++++++++------------------------------ 1 file changed, 67 insertions(+), 159 deletions(-) diff --git a/bin/loop-watchdog.sh b/bin/loop-watchdog.sh index d8cf3bf..32b33f4 100644 --- a/bin/loop-watchdog.sh +++ b/bin/loop-watchdog.sh @@ -1,36 +1,45 @@ #!/usr/bin/env bash -# loop-watchdog.sh — Self-healing monitor for all agent loops -# Runs every 2 minutes. Restarts dead loops, kills zombies, -# and files Gitea issues for problems it can't auto-fix. -# -# Usage: Run via cron or: nohup bash ~/.hermes/bin/loop-watchdog.sh & +# loop-watchdog.sh — Self-healing monitor for agent loops +# Single instance enforced via pidfile. Checks every 2 minutes. +# Restarts dead loops. Files Gitea issues for persistent failures. +# Does NOT kill git processes — workers handle their own cleanup. set -uo pipefail LOG_DIR="$HOME/.hermes/logs" LOG="$LOG_DIR/watchdog.log" -ISSUE_LOG="$LOG_DIR/watchdog-issues.json" # tracks filed issues to avoid duplicates +PIDFILE="$LOG_DIR/watchdog.pid" +ISSUE_LOG="$LOG_DIR/watchdog-issues.json" GITEA_URL="http://143.198.27.163:3000" ADMIN_TOKEN=$(cat "$HOME/.config/gitea/token" 2>/dev/null) -ISSUE_REPO="rockachopa/hermes-agent" # ops issues go here -CHECK_INTERVAL=120 # 2 minutes +ISSUE_REPO="rockachopa/hermes-agent" +CHECK_INTERVAL=120 mkdir -p "$LOG_DIR" [ -f "$ISSUE_LOG" ] || echo '{}' > "$ISSUE_LOG" +# === Single instance guard === +if [ -f "$PIDFILE" ]; then + old_pid=$(cat "$PIDFILE") + if kill -0 "$old_pid" 2>/dev/null; then + echo "Watchdog already running (PID $old_pid)" >&2 + exit 0 + fi +fi +echo $$ > "$PIDFILE" +trap 'rm -f "$PIDFILE"' EXIT + log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] WATCHDOG: $*" >> "$LOG" } -# File a Gitea issue for problems that can't be auto-fixed. -# Deduplicates: won't file the same issue_key within 6 hours. +# File a Gitea issue. Deduplicates within 6 hours. file_issue() { local issue_key="$1" local title="$2" local body="$3" local assignee="${4:-claude}" - # Check if we already filed this recently local should_file should_file=$(python3 -c " import json, time @@ -41,27 +50,22 @@ entry = filed.get('$issue_key', {}) if entry and entry.get('until', 0) > time.time(): print('no') else: - filed['$issue_key'] = {'until': time.time() + 21600, 'title': '''$title'''} + filed['$issue_key'] = {'until': time.time() + 21600} with open('$ISSUE_LOG', 'w') as f: json.dump(filed, f, indent=2) print('yes') " 2>/dev/null) - if [ "$should_file" != "yes" ]; then - return 0 - fi + [ "$should_file" != "yes" ] && return 0 log "FILING ISSUE: $title" local tmpfile="/tmp/watchdog-issue-$$.json" python3 -c " import json, sys -title = sys.argv[1] -body = sys.argv[2] -assignee = sys.argv[3] with open('$tmpfile', 'w') as f: json.dump({ - 'title': '[watchdog] ' + title, - 'body': body + '\n\n---\n*Auto-filed by loop-watchdog*', - 'assignees': [assignee], + 'title': '[watchdog] ' + sys.argv[1], + 'body': sys.argv[2] + '\n\n---\n*Auto-filed by loop-watchdog*', + 'assignees': [sys.argv[3]], }, f) " "$title" "$body" "$assignee" 2>/dev/null @@ -74,14 +78,12 @@ with open('$tmpfile', 'w') as f: fi } -# === HEALTH CHECKS === - +# Check if a loop is alive. Restart if dead. check_loop() { - local name="$1" # kimi | claude | gemini - local grep_pat="$2" # pattern to find the loop process - local wake_cmd="$3" # command to restart - local log_file="$4" # log to check for errors - local worker_pat="${5:-}" # optional: pattern for worker processes + local name="$1" + local grep_pat="$2" + local wake_cmd="$3" + local log_file="$4" local pid pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1) @@ -89,15 +91,12 @@ check_loop() { if [ -z "$pid" ]; then log "$name loop DOWN — restarting..." eval "$wake_cmd" - sleep 3 - - # Verify it came back + sleep 5 pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1) if [ -z "$pid" ]; then - file_issue \ - "${name}-loop-dead" \ + file_issue "${name}-loop-dead" \ "${name} loop won't start" \ - "The ${name} agent loop failed to start after automatic restart attempt.\n\nRestart command: \`${wake_cmd}\`\n\nLast 20 lines of log:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`\n\nPlease investigate and fix the startup issue." \ + "The ${name} agent loop failed to start.\nCommand: ${wake_cmd}\nLast log: $(tail -10 "$log_file" 2>/dev/null)" \ "claude" else log "$name loop restarted (PID $pid)" @@ -105,56 +104,34 @@ check_loop() { return fi - # Loop is running — check for stalls + # Check for stalls (no log activity > 30 min, and not idle) if [ -f "$log_file" ]; then - local last_activity - last_activity=$(stat -f %m "$log_file" 2>/dev/null || stat -c %Y "$log_file" 2>/dev/null || echo 0) - local now + local last_mod now stale + last_mod=$(stat -f %m "$log_file" 2>/dev/null || stat -c %Y "$log_file" 2>/dev/null || echo 0) now=$(date +%s) - local stale_seconds=$(( now - last_activity )) + stale=$(( now - last_mod )) - # If no log activity for 30 minutes, something is wrong - if [ "$stale_seconds" -gt 1800 ]; then - log "$name loop STALE — no activity for ${stale_seconds}s" - - # Check if it's just idle (empty queue) vs truly stuck + if [ "$stale" -gt 1800 ]; then local last_line last_line=$(tail -1 "$log_file" 2>/dev/null) - if echo "$last_line" | grep -q "Queue empty\|Waiting for assignments\|idle"; then - # Just idle, that's fine - return - fi - - # Kill and restart - log "$name loop stuck — killing and restarting..." - pkill -f "$grep_pat" 2>/dev/null - [ -n "$worker_pat" ] && pkill -f "$worker_pat" 2>/dev/null - sleep 2 - eval "$wake_cmd" - sleep 3 - - pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1) - if [ -z "$pid" ]; then - file_issue \ - "${name}-loop-stuck" \ - "${name} loop stuck and won't restart" \ - "The ${name} loop was stale (${stale_seconds}s no activity) and failed to restart.\n\nLast 20 lines:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`" \ - "claude" - else - log "$name loop recovered (PID $pid)" + # Idle is fine, truly stuck is not + if ! echo "$last_line" | grep -q "Queue empty\|Waiting\|idle"; then + log "$name loop stale (${stale}s) — restarting" + pkill -f "$grep_pat" 2>/dev/null + sleep 2 + eval "$wake_cmd" fi fi - # Check for crash loops (5+ failures in last 50 lines) + # Check for crash loops (10+ failures in last 50 lines) local recent_failures recent_failures=$(tail -50 "$log_file" 2>/dev/null | grep -c "FAILED:\|ERROR:" || true) - if [ "$recent_failures" -ge 5 ]; then - local error_sample - error_sample=$(tail -50 "$log_file" 2>/dev/null | grep "FAILED:\|ERROR:" | tail -5) - file_issue \ - "${name}-crash-loop" \ - "${name} agent in crash loop (${recent_failures} recent failures)" \ - "The ${name} agent loop has ${recent_failures} failures in its last 50 log lines, suggesting a systemic problem.\n\nRecent errors:\n\`\`\`\n${error_sample}\n\`\`\`\n\nPlease investigate the root cause — could be API issues, bad repo state, or CLI problems." \ + if [ "$recent_failures" -ge 10 ]; then + local errors + errors=$(tail -50 "$log_file" 2>/dev/null | grep "FAILED:\|ERROR:" | tail -5) + file_issue "${name}-crash-loop" \ + "${name} in crash loop (${recent_failures} failures in 50 lines)" \ + "Recent errors:\n${errors}" \ "claude" fi fi @@ -163,108 +140,39 @@ check_loop() { check_gitea() { if ! curl -sf --max-time 5 "${GITEA_URL}/api/v1/version" >/dev/null 2>&1; then log "Gitea UNREACHABLE" - file_issue \ - "gitea-down" \ - "Gitea instance unreachable" \ - "The Gitea instance at ${GITEA_URL} is not responding. All agent loops depend on it.\n\nThis needs immediate attention — check the VPS at 143.198.27.163." \ - "claude" + file_issue "gitea-down" "Gitea unreachable" \ + "Gitea at ${GITEA_URL} not responding. Check VPS 143.198.27.163." "claude" fi } -check_zombies() { - # Only kill git/pytest processes older than 5 minutes (300 seconds) - # Normal pushes from workers should complete in under a minute - local killed=0 - for pid in $(ps -eo pid,etime,command | grep -E "git.*push|git-remote-http" | grep -v grep | awk '{ - split($2, t, /[:-]/); - if (length(t)==3) secs=t[1]*3600+t[2]*60+t[3]; - else if (length(t)==2) secs=t[1]*60+t[2]; - else secs=t[1]; - if (secs > 300) print $1 - }'); do - kill "$pid" 2>/dev/null && killed=$((killed + 1)) - done - [ "$killed" -gt 0 ] && log "Killed $killed stuck git processes (>5min old)" - - local killed_py=0 - for pid in $(ps -eo pid,etime,command | grep "pytest tests/" | grep -v grep | awk '{ - split($2, t, /[:-]/); - if (length(t)==3) secs=t[1]*3600+t[2]*60+t[3]; - else if (length(t)==2) secs=t[1]*60+t[2]; - else secs=t[1]; - if (secs > 300) print $1 - }'); do - kill "$pid" 2>/dev/null && killed_py=$((killed_py + 1)) - done - [ "$killed_py" -gt 0 ] && log "Killed $killed_py orphaned pytest processes (>5min old)" -} - check_disk() { - local worktree_count - worktree_count=$(find "$HOME/worktrees" -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ') - - if [ "$worktree_count" -gt 30 ]; then - log "WARN: $worktree_count worktrees — possible leak" - file_issue \ - "worktree-leak" \ - "Worktree accumulation: ${worktree_count} dirs in ~/worktrees" \ - "There are ${worktree_count} worktree directories, suggesting cleanup is failing.\n\nList:\n\`\`\`\n$(ls -1 "$HOME/worktrees/" 2>/dev/null | head -30)\n\`\`\`\n\nPlease investigate which loops are leaking worktrees and fix the cleanup." \ - "claude" + local dir_count + dir_count=$(ls -1d "$HOME/worktrees"/claude-w* 2>/dev/null | wc -l | tr -d ' ') + if [ "${dir_count:-0}" -gt 40 ]; then + log "WARN: $dir_count clone dirs — cleaning old ones" + # Delete clone dirs not modified in the last hour + find "$HOME/worktrees" -maxdepth 1 -name "claude-w*" -type d -mmin +60 -exec rm -rf {} \; 2>/dev/null fi } -check_skip_lists() { - # If all agents have full skip lists, the whole system is stuck - for agent in claude gemini kimi; do - local skip_file="$LOG_DIR/${agent}-skip-list.json" - [ -f "$skip_file" ] || continue - local skip_count - skip_count=$(python3 -c " -import json, time -try: - with open('$skip_file') as f: skips = json.load(f) - active = sum(1 for v in skips.values() if v.get('until',0) > time.time()) - print(active) -except: print(0) -" 2>/dev/null) - - if [ "${skip_count:-0}" -gt 10 ]; then - file_issue \ - "${agent}-skip-overload" \ - "${agent} has ${skip_count} skipped issues — systemic failure" \ - "The ${agent} agent has ${skip_count} issues in its skip list, meaning most of its queue is blocked.\n\nSkip list contents:\n\`\`\`\n$(cat "$skip_file" 2>/dev/null | python3 -m json.tool 2>/dev/null | head -40)\n\`\`\`\n\nThis likely indicates a systemic problem (broken tests, bad config, API changes)." \ - "claude" - fi - done -} - # === MAIN === -log "=== Watchdog Started ===" +log "=== Watchdog Started (PID $$) ===" while true; do - # Gitea must be up for anything to work check_gitea - # Check each agent loop + check_loop "claude" "claude-loop.sh" \ + "nohup bash ~/.hermes/bin/claude-loop.sh 10 >> ~/.hermes/logs/claude-loop.log 2>&1 &" \ + "$LOG_DIR/claude-loop.log" + check_loop "kimi" "kimi-loop.sh" \ "nohup bash ~/.hermes/bin/kimi-loop.sh >> ~/.hermes/logs/kimi-loop.log 2>&1 &" \ - "$LOG_DIR/kimi-loop.log" \ - "kimi.*--print" + "$LOG_DIR/kimi-loop.log" - check_loop "claude" "claude-loop.sh" \ - "nohup bash ~/.hermes/bin/claude-loop.sh 3 >> ~/.hermes/logs/claude-loop.log 2>&1 &" \ - "$LOG_DIR/claude-loop.log" \ - "claude.*--print.*--dangerously" + # Gemini disabled — no API key configured + # check_loop "gemini" ... - check_loop "gemini" "gemini-loop.sh" \ - "nohup bash ~/.hermes/bin/gemini-loop.sh >> ~/.hermes/logs/gemini-loop.log 2>&1 &" \ - "$LOG_DIR/gemini-loop.log" \ - "gemini.*-p" - - # Housekeeping - check_zombies check_disk - check_skip_lists sleep "$CHECK_INTERVAL" done