#!/usr/bin/env bash
# loop-watchdog.sh — Self-healing monitor for agent loops
# Single instance enforced via pidfile. Checks every 2 minutes.
# Restarts dead loops. Files Gitea issues for persistent failures.
# Does NOT kill git processes — workers handle their own cleanup.

# errexit (-e) is left off: failing probes (pgrep, curl, grep) are routine
# here and every one of them is handled inline.
set -uo pipefail

LOG_DIR="$HOME/.hermes/logs"
LOG="$LOG_DIR/watchdog.log"
PIDFILE="$LOG_DIR/watchdog.pid"
ISSUE_LOG="$LOG_DIR/watchdog-issues.json"
GITEA_URL="http://143.198.27.163:3000"
# Empty when the token file is missing; issue filing then fails with a WARN.
ADMIN_TOKEN=$(cat "$HOME/.config/gitea/token" 2>/dev/null)
ISSUE_REPO="rockachopa/hermes-agent"
CHECK_INTERVAL=120
# Seconds during which an identical issue key is not filed again (6 hours).
ISSUE_DEDUP_SECONDS=21600

mkdir -p "$LOG_DIR"
[ -f "$ISSUE_LOG" ] || echo '{}' > "$ISSUE_LOG"

# === Single instance guard ===
if [ -f "$PIDFILE" ]; then
  old_pid=$(cat "$PIDFILE" 2>/dev/null)
  # Only probe a purely numeric PID: an empty or corrupt pidfile must not be
  # handed to kill (e.g. "-1" would address every process).
  if [[ "$old_pid" =~ ^[0-9]+$ ]] && kill -0 "$old_pid" 2>/dev/null; then
    echo "Watchdog already running (PID $old_pid)" >&2
    exit 0
  fi
fi
printf '%s\n' "$$" > "$PIDFILE"
trap 'rm -f "$PIDFILE"' EXIT

#######################################
# Append a timestamped line to the watchdog log.
# Globals:   LOG (read)
# Arguments: $* - message text
#######################################
log() {
  printf '[%s] WATCHDOG: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG"
}

#######################################
# Print a file's modification time as epoch seconds.
# The GNU form (-c %Y) is tried first. On GNU stat, "-f" means "filesystem
# info", so trying the BSD form first would print unrelated multi-line text
# before failing and corrupt the captured value.
# Arguments: $1 - path
# Outputs:   epoch seconds on stdout
# Returns:   0 on success, 1 if no numeric mtime could be obtained
#######################################
file_mtime() {
  local mtime
  mtime=$(stat -c %Y "$1" 2>/dev/null) \
    || mtime=$(stat -f %m "$1" 2>/dev/null) \
    || return 1
  case "$mtime" in
    '' | *[!0-9]*) return 1 ;;
  esac
  printf '%s\n' "$mtime"
}

#######################################
# File a Gitea issue. Deduplicates within ISSUE_DEDUP_SECONDS (6 hours).
# The dedup window is recorded BEFORE the POST is attempted, so a failed
# POST is not retried until the window expires. This keeps the watchdog from
# re-attempting (and re-logging) every cycle while Gitea itself is down.
# Globals:   ISSUE_LOG (read/written), ISSUE_DEDUP_SECONDS, GITEA_URL,
#            ISSUE_REPO, ADMIN_TOKEN (read)
# Arguments: $1 - dedup key
#            $2 - issue title (prefixed with "[watchdog] ")
#            $3 - issue body
#            $4 - assignee (default: claude)
# Returns:   0 always; failures are logged, never fatal
#######################################
file_issue() {
  local issue_key="$1"
  local title="$2"
  local body="$3"
  local assignee="${4:-claude}"

  # All values reach Python through argv rather than being spliced into the
  # source text, so quotes or other special characters in them are harmless.
  local should_file
  should_file=$(python3 -c '
import json
import sys
import time

path, key, window = sys.argv[1], sys.argv[2], int(sys.argv[3])
try:
    with open(path) as f:
        filed = json.load(f)
except (OSError, ValueError):
    filed = {}
if not isinstance(filed, dict):
    filed = {}

entry = filed.get(key) or {}
if isinstance(entry, dict) and entry.get("until", 0) > time.time():
    print("no")
else:
    filed[key] = {"until": time.time() + window}
    with open(path, "w") as f:
        json.dump(filed, f, indent=2)
    print("yes")
' "$ISSUE_LOG" "$issue_key" "$ISSUE_DEDUP_SECONDS" 2>/dev/null)
  [ "$should_file" = "yes" ] || return 0

  log "FILING ISSUE: $title"

  # Build the JSON payload in memory and stream it to curl on stdin; no
  # temporary file (and no predictable /tmp name) is needed.
  local payload
  if ! payload=$(python3 -c '
import json
import sys

json.dump({
    "title": "[watchdog] " + sys.argv[1],
    "body": sys.argv[2] + "\n\n---\n*Auto-filed by loop-watchdog*",
    "assignees": [sys.argv[3]],
}, sys.stdout)
' "$title" "$body" "$assignee" 2>/dev/null); then
    log "WARN: Failed to build issue payload: $title"
    return 0
  fi

  # --max-time keeps a hung Gitea from blocking the whole watchdog cycle.
  if ! printf '%s' "$payload" \
    | curl -sf --max-time 15 -X POST \
      "${GITEA_URL}/api/v1/repos/${ISSUE_REPO}/issues" \
      -H "Authorization: token ${ADMIN_TOKEN}" \
      -H "Content-Type: application/json" \
      --data-binary @- >/dev/null 2>&1; then
    log "WARN: Failed to file issue: $title"
  fi
  return 0
}

#######################################
# Check if a loop is alive. Restart if dead, restart if its log has gone
# stale while not idle, and file an issue if it looks like a crash loop.
# Arguments: $1 - loop name (used in log lines and issue keys)
#            $2 - pgrep/pkill -f pattern identifying the loop process
#            $3 - shell command string that (re)starts the loop
#            $4 - path to the loop's log file
# Returns:   0
#######################################
check_loop() {
  local name="$1"
  local grep_pat="$2"
  local wake_cmd="$3"
  local log_file="$4"
  local nl=$'\n'

  local pid
  pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
  if [ -z "$pid" ]; then
    log "$name loop DOWN — restarting..."
    # wake_cmd is a trusted string defined in this script; eval is required
    # because it carries redirections, "~" and a trailing "&".
    eval "$wake_cmd"
    sleep 5
    pid=$(pgrep -f "$grep_pat" 2>/dev/null | head -1)
    if [ -z "$pid" ]; then
      file_issue "${name}-loop-dead" \
        "${name} loop won't start" \
        "The ${name} agent loop failed to start.${nl}Command: ${wake_cmd}${nl}Last log: $(tail -10 "$log_file" 2>/dev/null)" \
        "claude"
    else
      log "$name loop restarted (PID $pid)"
    fi
    return 0
  fi

  [ -f "$log_file" ] || return 0

  # Check for stalls (no log activity > 30 min, and not idle).
  # If the mtime cannot be determined the stall check is skipped instead of
  # treating the log as infinitely old and restarting a healthy loop.
  local last_mod now stale
  if last_mod=$(file_mtime "$log_file"); then
    now=$(date +%s)
    stale=$((now - last_mod))
    if [ "$stale" -gt 1800 ]; then
      local last_line
      last_line=$(tail -1 "$log_file" 2>/dev/null)
      # Idle is fine, truly stuck is not
      case "$last_line" in
        *"Queue empty"* | *Waiting* | *idle*) ;;
        *)
          log "$name loop stale (${stale}s) — restarting"
          pkill -f "$grep_pat" 2>/dev/null
          sleep 2
          eval "$wake_cmd"
          ;;
      esac
    fi
  fi

  # Check for crash loops (10+ failures in last 50 lines).
  # grep -c exits 1 on zero matches while still printing "0"; "|| true"
  # only neutralises that exit status.
  local recent_failures
  recent_failures=$(tail -50 "$log_file" 2>/dev/null \
    | grep -Ec 'FAILED:|ERROR:' || true)
  if [ "${recent_failures:-0}" -ge 10 ]; then
    local errors
    errors=$(tail -50 "$log_file" 2>/dev/null \
      | grep -E 'FAILED:|ERROR:' | tail -5)
    file_issue "${name}-crash-loop" \
      "${name} in crash loop (${recent_failures} failures in 50 lines)" \
      "Recent errors:${nl}${errors}" \
      "claude"
  fi
  return 0
}

#######################################
# Probe the Gitea version endpoint; log and (attempt to) file an issue when
# it does not answer within 5 seconds.
# Globals:   GITEA_URL (read)
#######################################
check_gitea() {
  if ! curl -sf --max-time 5 "${GITEA_URL}/api/v1/version" >/dev/null 2>&1; then
    log "Gitea UNREACHABLE"
    file_issue "gitea-down" "Gitea unreachable" \
      "Gitea at ${GITEA_URL} not responding. Check VPS 143.198.27.163." \
      "claude"
  fi
}

#######################################
# When more than 40 claude-w* clone directories exist under ~/worktrees,
# delete those not modified in the last hour.
#######################################
check_disk() {
  local dir_count=0
  local dir
  # The trailing slash restricts the glob to directories. When nothing
  # matches, the literal pattern fails the -d test and the count stays 0.
  for dir in "$HOME/worktrees"/claude-w*/; do
    if [ -d "$dir" ]; then
      dir_count=$((dir_count + 1))
    fi
  done

  if [ "$dir_count" -gt 40 ]; then
    log "WARN: $dir_count clone dirs — cleaning old ones"
    # Delete clone dirs not modified in the last hour
    find "$HOME/worktrees" -maxdepth 1 -name "claude-w*" -type d -mmin +60 \
      -exec rm -rf -- {} + 2>/dev/null
  fi
}

# === MAIN ===
log "=== Watchdog Started (PID $$) ==="
[ -n "$ADMIN_TOKEN" ] || log "WARN: no Gitea token found — issue filing will fail"

while true; do
  check_gitea

  check_loop "claude" "claude-loop.sh" \
    "nohup bash ~/.hermes/bin/claude-loop.sh 10 >> ~/.hermes/logs/claude-loop.log 2>&1 &" \
    "$LOG_DIR/claude-loop.log"

  check_loop "kimi" "kimi-loop.sh" \
    "nohup bash ~/.hermes/bin/kimi-loop.sh >> ~/.hermes/logs/kimi-loop.log 2>&1 &" \
    "$LOG_DIR/kimi-loop.log"

  # Gemini disabled — no API key configured
  # check_loop "gemini" ...

  check_disk

  sleep "$CHECK_INTERVAL"
done