Files
hermes-config/bin/loop-watchdog.sh

252 lines
8.3 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
# loop-watchdog.sh — Self-healing monitor for all agent loops
# Runs every 2 minutes. Restarts dead loops, kills zombies,
# and files Gitea issues for problems it can't auto-fix.
#
# Usage: Run via cron or: nohup bash ~/.hermes/bin/loop-watchdog.sh &
#
# Note: 'set -e' is not enabled — NOTE(review): presumably so one failing
# check cannot kill the whole watchdog loop; confirm that was deliberate.
set -uo pipefail
# All watchdog state (its own log plus the issue-dedup ledger) lives here.
LOG_DIR="$HOME/.hermes/logs"
LOG="$LOG_DIR/watchdog.log"
ISSUE_LOG="$LOG_DIR/watchdog-issues.json" # tracks filed issues to avoid duplicates
GITEA_URL="http://143.198.27.163:3000"
# Empty if the token file is missing; issue filing will then fail at curl time.
ADMIN_TOKEN=$(cat "$HOME/.config/gitea/token" 2>/dev/null)
ISSUE_REPO="rockachopa/hermes-agent" # ops issues go here
CHECK_INTERVAL=120 # 2 minutes
mkdir -p "$LOG_DIR"
# Seed the dedup ledger with an empty JSON object on first run.
[ -f "$ISSUE_LOG" ] || echo '{}' > "$ISSUE_LOG"
log() {
  # Append one timestamped entry to the watchdog log file ($LOG).
  printf '[%s] WATCHDOG: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG"
}
# File a Gitea issue for problems that can't be auto-fixed.
# Deduplicates: won't file the same issue_key within 6 hours.
#
# Args:
#   $1 issue_key - stable dedup key (e.g. "kimi-loop-dead")
#   $2 title     - issue title (gets a "[watchdog] " prefix)
#   $3 body      - issue body; literal "\n" sequences become real newlines
#   $4 assignee  - Gitea user to assign (default: claude)
file_issue() {
  local issue_key="$1"
  local title="$2"
  local body="$3"
  local assignee="${4:-claude}"

  # Dedup check. Values reach Python via the environment so quotes or
  # backslashes in titles can never break (or inject into) the Python source.
  local should_file
  should_file=$(WD_LOG="$ISSUE_LOG" WD_KEY="$issue_key" WD_TITLE="$title" python3 -c '
import json, os, time
path = os.environ["WD_LOG"]
key = os.environ["WD_KEY"]
try:
    with open(path) as f:
        filed = json.load(f)
except (OSError, ValueError):
    filed = {}
entry = filed.get(key, {})
if entry and entry.get("until", 0) > time.time():
    print("no")
else:
    filed[key] = {"until": time.time() + 21600, "title": os.environ["WD_TITLE"]}
    with open(path, "w") as f:
        json.dump(filed, f, indent=2)
    print("yes")
' 2>/dev/null)
  if [ "$should_file" != "yes" ]; then
    return 0  # filed within the last 6h (or python3 unavailable) — stay quiet
  fi

  log "FILING ISSUE: $title"
  # Build the JSON payload safely (env vars again, never string interpolation).
  # Callers embed literal "\n" in $body; expand those to real newlines, as the
  # previous triple-quoted-string interpolation effectively did.
  local payload
  payload=$(WD_TITLE="$title" WD_BODY="$body" WD_ASSIGNEE="$assignee" \
    WD_STAMP="$(date '+%Y-%m-%d %H:%M:%S')" python3 -c '
import json, os
body = os.environ["WD_BODY"].replace("\\n", "\n")
body += "\n---\n*Auto-filed by loop-watchdog at " + os.environ["WD_STAMP"] + "*"
print(json.dumps({
    "title": "[watchdog] " + os.environ["WD_TITLE"],
    "body": body,
    "assignees": [os.environ["WD_ASSIGNEE"]],
}))
' 2>/dev/null)
  if [ -z "$payload" ]; then
    log "WARN: Failed to build issue payload: $title"
    return 0
  fi
  curl -sf -X POST "${GITEA_URL}/api/v1/repos/${ISSUE_REPO}/issues" \
    -H "Authorization: token ${ADMIN_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload" >/dev/null 2>&1 || log "WARN: Failed to file issue: $title"
}
# === HEALTH CHECKS ===
check_loop() {
# Supervise one agent loop: restart it if dead, kick it if stale, and
# escalate to a Gitea issue when self-healing fails.
#   $1 name       - agent label used in logs/issues (kimi | claude | gemini)
#   $2 grep_pat   - pgrep -f pattern identifying the loop process
#   $3 wake_cmd   - command (passed to eval) that restarts the loop
#   $4 log_file   - loop log; its mtime doubles as a liveness signal
#   $5 worker_pat - optional pattern for the loop's worker child processes
local name="$1" # kimi | claude | gemini
local grep_pat="$2" # pattern to find the loop process
local wake_cmd="$3" # command to restart
local log_file="$4" # log to check for errors
local worker_pat="${5:-}" # optional: pattern for worker processes
local pid
# NOTE(review): pgrep -f matches the full command line, so grep_pat could
# also match unrelated processes (e.g. an editor with the script open) —
# confirm the patterns passed from the main loop are specific enough.
pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
if [ -z "$pid" ]; then
log "$name loop DOWN — restarting..."
# wake_cmd is built in this script's own main loop, not external input.
eval "$wake_cmd"
sleep 3
# Verify it came back
pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
if [ -z "$pid" ]; then
file_issue \
"${name}-loop-dead" \
"${name} loop won't start" \
"The ${name} agent loop failed to start after automatic restart attempt.\n\nRestart command: \`${wake_cmd}\`\n\nLast 20 lines of log:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`\n\nPlease investigate and fix the startup issue." \
"claude"
else
log "$name loop restarted (PID $pid)"
fi
return
fi
# Loop is running — check for stalls
if [ -f "$log_file" ]; then
local last_activity
# Log mtime: BSD stat first (-f %m, macOS), then GNU stat (-c %Y), else 0.
last_activity=$(stat -f %m "$log_file" 2>/dev/null || stat -c %Y "$log_file" 2>/dev/null || echo 0)
local now
now=$(date +%s)
local stale_seconds=$(( now - last_activity ))
# If no log activity for 30 minutes, something is wrong
if [ "$stale_seconds" -gt 1800 ]; then
log "$name loop STALE — no activity for ${stale_seconds}s"
# Check if it's just idle (empty queue) vs truly stuck
local last_line
last_line=$(tail -1 "$log_file" 2>/dev/null)
if echo "$last_line" | grep -q "Queue empty\|Waiting for assignments\|idle"; then
# Just idle, that's fine
return
fi
# Kill and restart
log "$name loop stuck — killing and restarting..."
pkill -f "$grep_pat" 2>/dev/null
[ -n "$worker_pat" ] && pkill -f "$worker_pat" 2>/dev/null
sleep 2
eval "$wake_cmd"
sleep 3
pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
if [ -z "$pid" ]; then
file_issue \
"${name}-loop-stuck" \
"${name} loop stuck and won't restart" \
"The ${name} loop was stale (${stale_seconds}s no activity) and failed to restart.\n\nLast 20 lines:\n\`\`\`\n$(tail -20 "$log_file" 2>/dev/null)\n\`\`\`" \
"claude"
else
log "$name loop recovered (PID $pid)"
fi
fi
# Check for crash loops (5+ failures in last 50 lines)
# NOTE(review): BRE alternation "\|" is a GNU grep extension — verify the
# deploy host's grep (BSD grep differs), same as the stat fallback above.
local recent_failures
recent_failures=$(tail -50 "$log_file" 2>/dev/null | grep -c "FAILED:\|ERROR:" || true)
if [ "$recent_failures" -ge 5 ]; then
local error_sample
error_sample=$(tail -50 "$log_file" 2>/dev/null | grep "FAILED:\|ERROR:" | tail -5)
file_issue \
"${name}-crash-loop" \
"${name} agent in crash loop (${recent_failures} recent failures)" \
"The ${name} agent loop has ${recent_failures} failures in its last 50 log lines, suggesting a systemic problem.\n\nRecent errors:\n\`\`\`\n${error_sample}\n\`\`\`\n\nPlease investigate the root cause — could be API issues, bad repo state, or CLI problems." \
"claude"
fi
fi
}
check_gitea() {
  # Probe the Gitea API; when it does not answer within 5s, log the outage
  # and file an ops issue (file_issue's dedup prevents spamming).
  local version_endpoint="${GITEA_URL}/api/v1/version"
  if curl -sf --max-time 5 "$version_endpoint" >/dev/null 2>&1; then
    return 0
  fi
  log "Gitea UNREACHABLE"
  file_issue \
    "gitea-down" \
    "Gitea instance unreachable" \
    "The Gitea instance at ${GITEA_URL} is not responding. All agent loops depend on it.\n\nThis needs immediate attention — check the VPS at 143.198.27.163." \
    "claude"
}
check_zombies() {
  # Kill off runaway helper processes that agent loops sometimes abandon.
  # pgrep/pkill -f match the full command line against a POSIX extended
  # regex, so alternation is a bare '|' — the previous BRE-style '\|'
  # pattern passed to pkill never matched anything, so zombies were
  # counted (via grep) but never actually killed.
  local git_pat='git.*push|git-remote-http'
  local pytest_pat='pytest tests/'
  local stuck_git
  stuck_git=$(pgrep -f "$git_pat" 2>/dev/null | wc -l | tr -d ' ')
  local orphan_py
  orphan_py=$(pgrep -f "$pytest_pat" 2>/dev/null | wc -l | tr -d ' ')
  if [ "${stuck_git:-0}" -gt 3 ]; then
    log "Killing $stuck_git stuck git processes"
    pkill -f "$git_pat" 2>/dev/null || true
  fi
  if [ "${orphan_py:-0}" -gt 3 ]; then
    log "Killing $orphan_py orphaned pytest processes"
    pkill -f "$pytest_pat" 2>/dev/null || true
  fi
}
check_disk() {
  # Watch for worktree directory buildup — a sign that loop cleanup is broken.
  # (find counts ~/worktrees itself too, so the threshold is effectively 29.)
  local dir_count
  dir_count=$(find "$HOME/worktrees" -maxdepth 1 -type d 2>/dev/null | wc -l | tr -d ' ')
  [ "$dir_count" -gt 30 ] || return 0
  log "WARN: $dir_count worktrees — possible leak"
  file_issue \
    "worktree-leak" \
    "Worktree accumulation: ${dir_count} dirs in ~/worktrees" \
    "There are ${dir_count} worktree directories, suggesting cleanup is failing.\n\nList:\n\`\`\`\n$(ls -1 "$HOME/worktrees/" 2>/dev/null | head -30)\n\`\`\`\n\nPlease investigate which loops are leaking worktrees and fix the cleanup." \
    "claude"
}
check_skip_lists() {
  # If an agent's skip list is mostly full, its queue is effectively blocked —
  # that's a systemic failure worth a human-visible issue.
  local agent
  for agent in claude gemini kimi; do
    local skip_file="$LOG_DIR/${agent}-skip-list.json"
    [ -f "$skip_file" ] || continue
    # Count entries whose 'until' timestamp is still in the future. The path
    # goes in via the environment so odd characters can't break the Python.
    local skip_count
    skip_count=$(WD_SKIP_FILE="$skip_file" python3 -c '
import json, os, time
try:
    with open(os.environ["WD_SKIP_FILE"]) as f:
        skips = json.load(f)
    active = sum(1 for v in skips.values() if v.get("until", 0) > time.time())
    print(active)
except (OSError, ValueError, AttributeError):
    # Missing/corrupt file or non-dict values: treat as no active skips.
    print(0)
' 2>/dev/null)
    if [ "${skip_count:-0}" -gt 10 ]; then
      file_issue \
        "${agent}-skip-overload" \
        "${agent} has ${skip_count} skipped issues — systemic failure" \
        "The ${agent} agent has ${skip_count} issues in its skip list, meaning most of its queue is blocked.\n\nSkip list contents:\n\`\`\`\n$(python3 -m json.tool "$skip_file" 2>/dev/null | head -40)\n\`\`\`\n\nThis likely indicates a systemic problem (broken tests, bad config, API changes)." \
        "claude"
    fi
  done
}
# === MAIN ===
# One pass every CHECK_INTERVAL seconds: connectivity first, then each agent
# loop, then housekeeping. Runs forever; start with nohup (see file header).
log "=== Watchdog Started ==="
while true; do
# Gitea must be up for anything to work
check_gitea
# Check each agent loop
# check_loop args: name, process pattern, restart command, log file,
# worker-process pattern (used to kill stuck workers alongside the loop).
check_loop "kimi" "kimi-loop.sh" \
"nohup bash ~/.hermes/bin/kimi-loop.sh >> ~/.hermes/logs/kimi-loop.log 2>&1 &" \
"$LOG_DIR/kimi-loop.log" \
"kimi.*--print"
check_loop "claude" "claude-loop.sh" \
"nohup bash ~/.hermes/bin/claude-loop.sh 3 >> ~/.hermes/logs/claude-loop.log 2>&1 &" \
"$LOG_DIR/claude-loop.log" \
"claude.*--print.*--dangerously"
check_loop "gemini" "gemini-loop.sh" \
"nohup bash ~/.hermes/bin/gemini-loop.sh >> ~/.hermes/logs/gemini-loop.log 2>&1 &" \
"$LOG_DIR/gemini-loop.log" \
"gemini.*-p"
# Housekeeping
check_zombies
check_disk
check_skip_lists
sleep "$CHECK_INTERVAL"
done