Files
hermes-config/bin/loop-watchdog.sh
Alexander Whitestone 64a8ffa329 fix: watchdog single-instance guard, stop killing worker processes
- Pidfile guard prevents duplicate watchdog instances
- Removed check_zombies — was killing legitimate git push/clone from
  active workers (caused mass exit 143 failures)
- Replaced with check_disk that cleans clone dirs >1hr old
- Disabled gemini loop check (no API key configured)
- Workers use fresh clone per issue (no shared worktree contention)
- Simplified cleanup to rm -rf (no git worktree bookkeeping)
- Tested file_issue end-to-end — confirmed working

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 22:03:58 -04:00

179 lines
5.2 KiB
Bash

#!/usr/bin/env bash
# loop-watchdog.sh — Self-healing monitor for agent loops
# Single instance enforced via pidfile. Checks every 2 minutes.
# Restarts dead loops. Files Gitea issues for persistent failures.
# Does NOT kill git processes — workers handle their own cleanup.
#
# Deliberately NOT `set -e`: the watchdog must survive individual check
# failures and keep looping.
set -uo pipefail

readonly LOG_DIR="$HOME/.hermes/logs"
readonly LOG="$LOG_DIR/watchdog.log"
readonly PIDFILE="$LOG_DIR/watchdog.pid"
readonly ISSUE_LOG="$LOG_DIR/watchdog-issues.json"
readonly GITEA_URL="http://143.198.27.163:3000"
# Empty when the token file is missing; issue POSTs will then fail and be
# logged as WARN rather than aborting the watchdog.
ADMIN_TOKEN=$(cat "$HOME/.config/gitea/token" 2>/dev/null)
readonly ADMIN_TOKEN
readonly ISSUE_REPO="rockachopa/hermes-agent"
readonly CHECK_INTERVAL=120

mkdir -p "$LOG_DIR"
# Seed the issue-dedup store with an empty JSON object on first run.
[ -f "$ISSUE_LOG" ] || echo '{}' > "$ISSUE_LOG"
# === Single instance guard ===
# Defer to a live predecessor; a pidfile whose process is gone is stale
# and simply gets overwritten below.
if [ -f "$PIDFILE" ] && kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
  echo "Watchdog already running (PID $(cat "$PIDFILE"))" >&2
  exit 0
fi
echo $$ > "$PIDFILE"
# Remove the pidfile on any exit path so restarts are never blocked.
trap 'rm -f "$PIDFILE"' EXIT
# Append one timestamped line to the watchdog log file.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] WATCHDOG: %s\n' "$stamp" "$*" >> "$LOG"
}
# File a Gitea issue. Deduplicates per issue_key within a 6-hour window.
#   $1 issue_key — stable dedup key, recorded in $ISSUE_LOG
#   $2 title     — issue title (gets a "[watchdog] " prefix)
#   $3 body      — issue body (auto-filed footer appended)
#   $4 assignee  — Gitea user to assign (default: claude)
# Always returns 0; a failed POST is logged as WARN, never fatal.
file_issue() {
  local issue_key="$1"
  local title="$2"
  local body="$3"
  local assignee="${4:-claude}"
  local should_file
  # Dedup bookkeeping lives in a small JSON file. Pass the path and key
  # via argv so shell values are never interpolated into Python source
  # (keys/paths containing quotes would otherwise break or inject code).
  should_file=$(python3 -c "
import json, sys, time
path, key = sys.argv[1], sys.argv[2]
try:
    with open(path) as f:
        filed = json.load(f)
except (OSError, ValueError):
    filed = {}
entry = filed.get(key, {})
if entry and entry.get('until', 0) > time.time():
    print('no')
else:
    filed[key] = {'until': time.time() + 21600}
    with open(path, 'w') as f:
        json.dump(filed, f, indent=2)
    print('yes')
" "$ISSUE_LOG" "$issue_key" 2>/dev/null)
  [ "$should_file" != "yes" ] && return 0
  log "FILING ISSUE: $title"
  local tmpfile
  # mktemp instead of a predictable /tmp name (symlink-attack hygiene).
  tmpfile=$(mktemp /tmp/watchdog-issue.XXXXXX) || return 0
  # Build the JSON payload with python so title/body are safely escaped.
  python3 -c "
import json, sys
with open(sys.argv[4], 'w') as f:
    json.dump({
        'title': '[watchdog] ' + sys.argv[1],
        'body': sys.argv[2] + '\n\n---\n*Auto-filed by loop-watchdog*',
        'assignees': [sys.argv[3]],
    }, f)
" "$title" "$body" "$assignee" "$tmpfile" 2>/dev/null
  # -s: only POST if python actually produced a payload.
  if [ -s "$tmpfile" ]; then
    curl -sf -X POST "${GITEA_URL}/api/v1/repos/${ISSUE_REPO}/issues" \
      -H "Authorization: token ${ADMIN_TOKEN}" \
      -H "Content-Type: application/json" \
      -d @"$tmpfile" >/dev/null 2>&1 || log "WARN: Failed to file issue: $title"
  fi
  rm -f "$tmpfile"
}
# Check if a loop is alive. Restart if dead.
# Ensure one agent loop is running and making progress.
#   $1 name     — label used in log lines and issue dedup keys
#   $2 grep_pat — pattern for `pgrep -f` that identifies the loop process
#   $3 wake_cmd — shell command string (eval'd) that starts the loop
#   $4 log_file — the loop's own log, used for stall/crash-loop detection
check_loop() {
local name="$1"
local grep_pat="$2"
local wake_cmd="$3"
local log_file="$4"
local pid
pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
if [ -z "$pid" ]; then
log "$name loop DOWN — restarting..."
eval "$wake_cmd"
# Give the restarted loop a moment before re-checking it came up.
sleep 5
pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
if [ -z "$pid" ]; then
# Still not running after a restart attempt — escalate via issue.
file_issue "${name}-loop-dead" \
"${name} loop won't start" \
"The ${name} agent loop failed to start.\nCommand: ${wake_cmd}\nLast log: $(tail -10 "$log_file" 2>/dev/null)" \
"claude"
else
log "$name loop restarted (PID $pid)"
fi
return
fi
# Check for stalls (no log activity > 30 min, and not idle)
if [ -f "$log_file" ]; then
local last_mod now stale
# stat -f %m is BSD/macOS; stat -c %Y is GNU — try both, default to 0.
last_mod=$(stat -f %m "$log_file" 2>/dev/null || stat -c %Y "$log_file" 2>/dev/null || echo 0)
now=$(date +%s)
stale=$(( now - last_mod ))
if [ "$stale" -gt 1800 ]; then
local last_line
last_line=$(tail -1 "$log_file" 2>/dev/null)
# Idle is fine, truly stuck is not
if ! echo "$last_line" | grep -q "Queue empty\|Waiting\|idle"; then
log "$name loop stale (${stale}s) — restarting"
# Kill the wedged loop process(es) matching the pattern, then relaunch.
pkill -f "$grep_pat" 2>/dev/null
sleep 2
eval "$wake_cmd"
fi
fi
# Check for crash loops (10+ failures in last 50 lines)
local recent_failures
# grep -c prints 0 (but exits 1) on no match; || true keeps the 0 output.
recent_failures=$(tail -50 "$log_file" 2>/dev/null | grep -c "FAILED:\|ERROR:" || true)
if [ "$recent_failures" -ge 10 ]; then
local errors
errors=$(tail -50 "$log_file" 2>/dev/null | grep "FAILED:\|ERROR:" | tail -5)
file_issue "${name}-crash-loop" \
"${name} in crash loop (${recent_failures} failures in 50 lines)" \
"Recent errors:\n${errors}" \
"claude"
fi
fi
}
# Probe the Gitea API; log and file an issue when it is unreachable.
check_gitea() {
  # Guard clause: reachable means nothing to do.
  curl -sf --max-time 5 "${GITEA_URL}/api/v1/version" >/dev/null 2>&1 && return
  log "Gitea UNREACHABLE"
  file_issue "gitea-down" "Gitea unreachable" \
    "Gitea at ${GITEA_URL} not responding. Check VPS 143.198.27.163." "claude"
}
# Cap the number of per-issue clone dirs under ~/worktrees.
# When more than 40 claude-w* dirs exist, delete those untouched for over
# an hour (workers use fresh clones per issue, so old dirs are abandoned).
check_disk() {
  local dir_count
  # Count with find instead of parsing `ls` output (robust to odd names).
  dir_count=$(find "$HOME/worktrees" -maxdepth 1 -name "claude-w*" -type d 2>/dev/null | wc -l | tr -d ' ')
  if [ "${dir_count:-0}" -gt 40 ]; then
    log "WARN: $dir_count clone dirs — cleaning old ones"
    # Delete clone dirs not modified in the last hour
    find "$HOME/worktrees" -maxdepth 1 -name "claude-w*" -type d -mmin +60 -exec rm -rf {} \; 2>/dev/null
  fi
}
# === MAIN ===
log "=== Watchdog Started (PID $$) ==="
# Poll forever: each pass probes Gitea, revives dead/stale agent loops,
# prunes excess clone dirs, then sleeps CHECK_INTERVAL (120s).
while true; do
check_gitea
check_loop "claude" "claude-loop.sh" \
"nohup bash ~/.hermes/bin/claude-loop.sh 10 >> ~/.hermes/logs/claude-loop.log 2>&1 &" \
"$LOG_DIR/claude-loop.log"
check_loop "kimi" "kimi-loop.sh" \
"nohup bash ~/.hermes/bin/kimi-loop.sh >> ~/.hermes/logs/kimi-loop.log 2>&1 &" \
"$LOG_DIR/kimi-loop.log"
# Gemini disabled — no API key configured
# check_loop "gemini" ...
check_disk
sleep "$CHECK_INTERVAL"
done