#!/usr/bin/env bash
# pane-watchdog.sh — Detect stuck/dead tmux panes and auto-restart them
#
# Tracks output hash per pane across cycles. If a pane's captured output
# hasn't changed for STUCK_CYCLES consecutive checks, the pane is STUCK.
# Dead panes (PID gone) are also detected.
#
# On STUCK/DEAD (only for panes whose current command looks like an agent):
#   1. Respawn the pane in place with --resume (session ID from manifest)
#   2. Fallback: fresh prompt with last known task from logs
#
# State file: ~/.hermes/pane-state.json
# Log:        ~/.hermes/logs/pane-watchdog.log
#
# Usage:
#   pane-watchdog.sh                  # One-shot check all sessions
#   pane-watchdog.sh --daemon         # Run every CHECK_INTERVAL seconds
#   pane-watchdog.sh --status         # Print current pane state
#   pane-watchdog.sh --session NAME   # Check only one session
#
# Issue: timmy-config #515

# No `-e` on purpose: most tmux/python probes are allowed to fail and are
# handled explicitly where the result matters.
set -uo pipefail

export PATH="/opt/homebrew/bin:$HOME/.local/bin:$HOME/.hermes/bin:/usr/local/bin:$PATH"

# === CONFIG ===
STATE_FILE="${PANE_STATE_FILE:-$HOME/.hermes/pane-state.json}"
LOG_FILE="${PANE_WATCHDOG_LOG:-$HOME/.hermes/logs/pane-watchdog.log}"
CHECK_INTERVAL="${PANE_CHECK_INTERVAL:-120}"  # seconds between cycles
STUCK_CYCLES="${STUCK_CYCLES:-2}"             # unchanged cycles before STUCK
MAX_RESTART_ATTEMPTS=3                        # restart budget per pane
RESTART_COOLDOWN=3600                         # seconds after the last restart before the budget resets
CAPTURE_LINES=40                              # lines of output to hash

# STUCK_CYCLES is handed to Python's int(); fall back to the default if the
# environment supplied something that is not a positive integer.
[[ "$STUCK_CYCLES" =~ ^[1-9][0-9]*$ ]] || STUCK_CYCLES=2

# Sessions to monitor (all if empty)
MONITOR_SESSIONS="${PANE_WATCHDOG_SESSIONS:-}"

mkdir -p "$(dirname "$STATE_FILE")" "$(dirname "$LOG_FILE")"

# Append a timestamped line to LOG_FILE.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

# === HELPERS ===

# Read stdin, print its SHA-256 as hex. Uses shasum when present, otherwise
# sha256sum (both print "<hex>  -").
_sha256() {
  if command -v shasum > /dev/null 2>&1; then
    shasum -a 256
  else
    sha256sum
  fi | cut -d' ' -f1
}

# Quote $1 as a single POSIX-sh word (wrap in '...' and escape embedded ').
# tmux hands shell-command strings to a shell, so anything interpolated into
# them must be quoted this way.
_sh_quote() {
  local raw="$1" quote="'" escaped="'\\''"
  printf "'%s'" "${raw//$quote/$escaped}"
}

# Print one format field (e.g. pane_pid) of exactly the pane named by $1.
# display-message resolves -t as a pane, whereas `list-panes -t sess:w.p`
# lists every pane of the window. Returns tmux's status.
_pane_field() {
  local target="$1" field="$2"
  tmux display-message -p -t "$target" "#{${field}}" 2> /dev/null
}

# Capture last N lines of pane output and hash them.
# If the capture fails, the literal string "DEAD" is hashed instead.
capture_pane_hash() {
  local target="$1"
  local output
  output=$(tmux capture-pane -t "$target" -p -S "-${CAPTURE_LINES}" 2> /dev/null || echo "DEAD")
  printf '%s' "$output" | _sha256
}

# Check if pane PID is alive.
# Returns 1 when the pane cannot be found or its PID no longer exists.
pane_pid_alive() {
  local target="$1"
  local pid
  pid=$(_pane_field "$target" pane_pid || true)
  [[ "$pid" =~ ^[0-9]+$ ]] || return 1 # pane doesn't exist
  kill -0 "$pid" 2> /dev/null
}

# Get pane start command ("unknown" if the pane cannot be queried).
pane_start_command() {
  local target="$1"
  local value
  value=$(_pane_field "$target" pane_start_command) || value="unknown"
  printf '%s\n' "$value"
}

# Get the pane's current running command ("unknown" if it cannot be queried).
pane_current_command() {
  local target="$1"
  local value
  value=$(_pane_field "$target" pane_current_command) || value="unknown"
  printf '%s\n' "$value"
}

# Only restart panes running hermes/agent commands (not zsh, python3 repls, etc.)
is_restartable() {
  local cmd="$1"
  case "$cmd" in
    hermes | *hermes* | *agent* | *timmy* | *kimi* | *claude-loop* | *gemini-loop*)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}

# Get session ID from hermes manifest if available.
# Prints the manifest's "session_id" (or "id"); prints nothing useful when the
# manifest is missing or unreadable.
get_hermes_session_id() {
  local session_name="$1"
  local manifest="$HOME/.hermes/sessions/${session_name}/manifest.json"

  if [[ ! -f "$manifest" ]]; then
    echo ""
    return 0
  fi

  # The path is passed as argv rather than pasted into the Python source, so
  # quotes in the session name cannot break (or inject into) the script.
  python3 - "$manifest" << 'PYEOF' 2> /dev/null || echo ""
import json
import sys

try:
    with open(sys.argv[1]) as f:
        manifest = json.load(f)
    print(manifest.get("session_id", manifest.get("id", "")))
except (OSError, ValueError, AttributeError):
    # Best effort: an unreadable manifest simply means "no session to resume".
    pass
PYEOF
}

# Get last task from pane logs.
# Looks at files under ~/.hermes/logs modified within the last day whose name
# contains the session name, picks the first in reverse name order (not by
# mtime), and prints up to 200 bytes of its last task-looking line.
get_last_task() {
  local session_name="$1"
  local log_dir="$HOME/.hermes/logs"
  local log_file

  log_file=$(find "$log_dir" -name "*${session_name}*" -type f -mtime -1 2> /dev/null \
    | sort -r | head -1)

  if [[ -n "$log_file" && -f "$log_file" ]]; then
    # -E alternation is portable; "\|" inside a basic regex is a GNU extension.
    grep -iE 'task:|prompt:|issue|working on' "$log_file" 2> /dev/null \
      | tail -1 | sed 's/.*[:>] *//' | head -c 200
  fi
}

# Run command_line for a restart and print the id of the pane it runs in.
# Prefers respawning pane_id in place (-k kills whatever is still running);
# if that pane is gone, opens a detached split in the session instead.
# Returns non-zero when neither works (e.g. the session no longer exists).
_launch_in_pane() {
  local pane_id="$1" session_name="$2" command_line="$3"

  if [[ -n "$pane_id" ]] \
    && tmux respawn-pane -k -t "$pane_id" "$command_line" 2> /dev/null; then
    printf '%s\n' "$pane_id"
    return 0
  fi

  # "name:" pins the lookup to a session; a bare name is first tried as a pane.
  tmux split-window -d -P -F '#{pane_id}' -t "${session_name}:" "$command_line" 2> /dev/null
}

# Restart a pane: first with `hermes chat --resume <id>`, then with a fresh
# `hermes chat` (announcing the last known task when one is found in the logs).
# Returns 0 once a launched pane is still alive after a 2 second grace period.
restart_pane() {
  local target="$1"
  local session_name="${target%%:*}"
  local restart_log="$HOME/.hermes/logs/${session_name}-restart.log"
  local pane_id live_pane session_id last_task fresh_cmd launch_cmd

  log "RESTART: Attempting to restart $target"

  # Pin the pane by its unique id: index-based targets (sess:w.p) can end up
  # pointing at a different pane once panes are closed.
  pane_id=$(_pane_field "$target" pane_id || true)
  fresh_cmd="hermes chat 2>&1 | tee -a $(_sh_quote "$restart_log")"

  # Try --resume with session ID
  session_id=$(get_hermes_session_id "$session_name")
  if [[ -n "$session_id" ]]; then
    log "RESTART: Trying --resume with session $session_id"
    launch_cmd="hermes chat --resume $(_sh_quote "$session_id") 2>&1"
    launch_cmd+=" | tee -a $(_sh_quote "$restart_log")"
    if live_pane=$(_launch_in_pane "$pane_id" "$session_name" "$launch_cmd"); then
      sleep 2
      if pane_pid_alive "$live_pane"; then
        log "RESTART: Success with --resume"
        return 0
      fi
      # Reuse the same pane for the fallback if it is still around.
      pane_id="$live_pane"
    fi
  fi

  # Fallback: fresh prompt
  last_task=$(get_last_task "$session_name")
  if [[ -n "$last_task" ]]; then
    log "RESTART: Fallback — fresh prompt with task: $last_task"
    # last_task comes from log files; quote it so it is printed, never executed.
    launch_cmd="printf '%s\\n' $(_sh_quote "Watchdog restart — last task: $last_task")"
    launch_cmd+=" && ${fresh_cmd}"
  else
    log "RESTART: Fallback — fresh hermes chat"
    launch_cmd="$fresh_cmd"
  fi

  if live_pane=$(_launch_in_pane "$pane_id" "$session_name" "$launch_cmd"); then
    sleep 2
    if pane_pid_alive "$live_pane"; then
      log "RESTART: Fallback restart succeeded"
      return 0
    fi
  fi

  log "RESTART: FAILED to restart $target"
  return 1
}

# === STATE MANAGEMENT ===

# Print the raw state JSON ("{}" when no state file exists yet).
read_state() {
  if [[ -f "$STATE_FILE" ]]; then
    cat "$STATE_FILE"
  else
    echo "{}"
  fi
}

# Overwrite the state file with $1.
write_state() {
  printf '%s\n' "$1" > "$STATE_FILE"
}

# Shared read-modify-write for one pane's entry in STATE_FILE.
# Usage: _pane_state_op <update|reset|bump> <target> [args...]
#   update <hash> <true|false> <now> <stuck_cycles>  -> prints the pane JSON
#   reset  <now> <cooldown>                          -> prints restart_attempts
#   bump   <now>                                     -> prints restart_attempts
# A missing or corrupt state file is treated as empty for every operation, and
# the file is replaced via rename so a reader never sees a partial write.
_pane_state_op() {
  python3 - "$STATE_FILE" "$@" << 'PYEOF'
import json
import os
import sys

state_file, op, target = sys.argv[1], sys.argv[2], sys.argv[3]
extra = sys.argv[4:]


def load_state():
    try:
        with open(state_file) as f:
            data = json.load(f)
    except (OSError, ValueError):
        return {}
    return data if isinstance(data, dict) else {}


def save_state(state):
    tmp_path = "%s.%d.tmp" % (state_file, os.getpid())
    with open(tmp_path, "w") as f:
        json.dump(state, f, indent=2)
    os.replace(tmp_path, state_file)


state = load_state()
pane = state.get(target)
if not isinstance(pane, dict):
    pane = {}

if op == "update":
    new_hash = extra[0]
    is_alive = extra[1] == "true"
    now = int(extra[2])
    stuck_cycles = int(extra[3])

    defaults = (
        ("hash", ""),
        ("same_count", 0),
        ("status", "UNKNOWN"),
        ("last_change", 0),
        ("last_check", 0),
        ("restart_attempts", 0),
        ("last_restart", 0),
        ("current_command", ""),
    )
    for key, value in defaults:
        pane.setdefault(key, value)

    if not is_alive:
        pane["status"] = "DEAD"
        pane["same_count"] = 0
    elif new_hash == pane["hash"]:
        pane["same_count"] += 1
        if pane["same_count"] >= stuck_cycles:
            pane["status"] = "STUCK"
        else:
            pane["status"] = "STALE" if pane["same_count"] > 0 else "OK"
    else:
        pane["hash"] = new_hash
        pane["same_count"] = 0
        pane["status"] = "OK"
        pane["last_change"] = now

    pane["last_check"] = now
    result = json.dumps(pane)
elif op == "reset":
    now = int(extra[0])
    cooldown = int(extra[1])
    if now - pane.get("last_restart", 0) > cooldown:
        pane["restart_attempts"] = 0
    result = str(pane.get("restart_attempts", 0))
elif op == "bump":
    now = int(extra[0])
    pane["restart_attempts"] = pane.get("restart_attempts", 0) + 1
    pane["last_restart"] = now
    pane["status"] = "RESTARTING"
    result = str(pane["restart_attempts"])
else:
    sys.exit("unknown state operation: %s" % op)

state[target] = pane
save_state(state)
print(result)
PYEOF
}

# Update state for a single pane and return JSON status.
# Args: target, output hash, "true"/"false" liveness.
update_pane_state() {
  local target="$1"
  local hash="$2"
  local is_alive="$3"
  local now
  now=$(date +%s)

  _pane_state_op update "$target" "$hash" "$is_alive" "$now" "$STUCK_CYCLES"
}

# Reset restart attempt counter if cooldown expired.
# Prints the (possibly reset) attempt count.
maybe_reset_restarts() {
  local target="$1"
  local now
  now=$(date +%s)

  _pane_state_op reset "$target" "$now" "$RESTART_COOLDOWN"
}

# Record one more restart attempt (and its timestamp) for the pane.
# Prints the new attempt count.
increment_restart_attempt() {
  local target="$1"
  local now
  now=$(date +%s)

  _pane_state_op bump "$target" "$now"
}

# === CORE CHECK ===

# Sample one pane, record the result in the state file, and react to it:
# DEAD/STUCK panes are restarted when they run a restartable command, STALE
# panes are only logged.
check_pane() {
  local target="$1"
  local hash current_cmd result status count
  local is_alive="false"

  # Capture state
  hash=$(capture_pane_hash "$target")
  if pane_pid_alive "$target"; then
    is_alive="true"
  fi

  # Get current command for the pane
  current_cmd=$(pane_current_command "$target")

  # Update state and get result
  result=$(update_pane_state "$target" "$hash" "$is_alive")
  status=$(printf '%s' "$result" \
    | python3 -c "import json,sys; print(json.loads(sys.stdin.read()).get('status','UNKNOWN'))" \
      2> /dev/null || echo "UNKNOWN")

  case "$status" in
    OK)
      # Healthy, do nothing
      ;;
    DEAD | STUCK)
      if [[ "$status" == "DEAD" ]]; then
        log "DETECTED: $target is DEAD (PID gone) cmd=$current_cmd"
      else
        log "DETECTED: $target is STUCK (output unchanged for ${STUCK_CYCLES} cycles) cmd=$current_cmd"
      fi
      if is_restartable "$current_cmd"; then
        handle_stuck "$target"
      else
        log "SKIP: $target not a hermes pane (cmd=$current_cmd), not restarting"
      fi
      ;;
    STALE)
      # Output unchanged but within threshold — just log
      count=$(printf '%s' "$result" \
        | python3 -c "import json,sys; print(json.loads(sys.stdin.read()).get('same_count',0))" \
          2> /dev/null || echo "?")
      log "STALE: $target unchanged for $count cycle(s)"
      ;;
  esac
}

# Restart a stuck/dead pane if its restart budget allows it.
# Returns 1 (and prints an ALERT on stderr) once MAX_RESTART_ATTEMPTS is
# reached, or when the budget cannot be read.
handle_stuck() {
  local target="$1"
  local attempts

  # Check restart budget
  attempts=$(maybe_reset_restarts "$target")
  if ! [[ "$attempts" =~ ^[0-9]+$ ]]; then
    # Without a readable budget we cannot bound restarts, so do not restart.
    log "ERROR: cannot read restart budget for $target — not restarting"
    return 1
  fi

  if ((attempts >= MAX_RESTART_ATTEMPTS)); then
    log "ESCALATION: $target stuck ${attempts}x — manual intervention needed"
    echo "ALERT: $target stuck after $attempts restart attempts" >&2
    return 1
  fi

  attempts=$(increment_restart_attempt "$target")
  log "ACTION: Restarting $target (attempt $attempts/$MAX_RESTART_ATTEMPTS)"

  if restart_pane "$target"; then
    log "OK: $target restarted successfully"
  else
    log "FAIL: $target restart failed (attempt $attempts)"
  fi
}

# Check every pane of every monitored session (MONITOR_SESSIONS, comma
# separated, or all tmux sessions when it is empty).
check_all_sessions() {
  local -a sessions=()
  local -a panes=()
  local session target line
  local total=0

  if [[ -n "$MONITOR_SESSIONS" ]]; then
    IFS=',' read -ra sessions <<< "$MONITOR_SESSIONS"
  else
    while IFS= read -r line; do
      [[ -n "$line" ]] && sessions+=("$line")
    done < <(tmux list-sessions -F '#{session_name}' 2> /dev/null || true)
  fi

  # ${arr[@]+...} keeps `set -u` quiet for empty arrays on bash < 4.4.
  for session in ${sessions[@]+"${sessions[@]}"}; do
    [[ -z "$session" ]] && continue

    # Get pane targets. -s lists the panes of every window in the session;
    # reading line by line keeps session names containing spaces intact.
    panes=()
    while IFS= read -r line; do
      [[ -n "$line" ]] && panes+=("$line")
    done < <(tmux list-panes -s -t "$session" \
      -F "${session}:#{window_index}.#{pane_index}" 2> /dev/null || true)

    for target in ${panes[@]+"${panes[@]}"}; do
      check_pane "$target"
      total=$((total + 1))
    done
  done

  log "CHECK: Processed $total panes"
}
# === STATUS DISPLAY ===

# Print a table of every pane recorded in STATE_FILE: status, number of
# unchanged cycles, time since the output last changed, and restart attempts.
show_status() {
  if [[ ! -f "$STATE_FILE" ]]; then
    echo "No pane state file found at $STATE_FILE"
    echo "Run pane-watchdog.sh once to initialize."
    return 0
  fi

  # Force UTF-8 output so the status icons print under a C/POSIX locale too.
  PYTHONIOENCODING=utf-8 python3 - "$STATE_FILE" << 'PYEOF'
import json, sys, time

state_file = sys.argv[1]
try:
    with open(state_file) as f:
        state = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    print("No state data yet.")
    sys.exit(0)

if not state:
    print("No panes tracked.")
    sys.exit(0)

# One icon per known status; anything else is shown as "?".
ICONS = {"OK": "✓", "STUCK": "✖", "DEAD": "☠", "STALE": "⏳"}

now = int(time.time())
print(f"{'PANE':<35} {'STATUS':<12} {'STALE':<6} {'LAST CHANGE':<15} {'RESTARTS'}")
print("-" * 90)

for target in sorted(state.keys()):
    p = state[target]
    status = p.get("status", "?")
    same = p.get("same_count", 0)
    last_change = p.get("last_change", 0)
    restarts = p.get("restart_attempts", 0)

    if last_change:
        ago = now - last_change
        if ago < 60:
            change_str = f"{ago}s ago"
        elif ago < 3600:
            change_str = f"{ago//60}m ago"
        else:
            change_str = f"{ago//3600}h ago"
    else:
        change_str = "never"

    icon = ICONS.get(status, "?")
    print(f" {icon} {target:<32} {status:<12} {same:<6} {change_str:<15} {restarts}")
PYEOF
}

# === DAEMON MODE ===

# Print $1 if it is a positive number of seconds (integer or decimal),
# otherwise print the default of 120. A value `sleep` rejects would make the
# daemon loop spin without pausing.
_effective_interval() {
  local requested="${1:-}"
  local number_re='^[0-9]+([.][0-9]+)?$'

  if [[ "$requested" =~ $number_re && "$requested" =~ [1-9] ]]; then
    printf '%s\n' "$requested"
  else
    printf '%s\n' 120
  fi
}

# Run check_all_sessions forever, pausing CHECK_INTERVAL seconds between
# cycles. Never returns; stop it with Ctrl+C.
run_daemon() {
  local interval
  interval=$(_effective_interval "$CHECK_INTERVAL")
  if [[ "$interval" != "$CHECK_INTERVAL" ]]; then
    log "DAEMON: Invalid check interval '${CHECK_INTERVAL}', using ${interval}s"
  fi

  log "DAEMON: Starting (interval=${interval}s, stuck_threshold=${STUCK_CYCLES})"
  echo "Pane watchdog started. Checking every ${interval}s. Ctrl+C to stop."
  echo "Log: $LOG_FILE"
  echo "State: $STATE_FILE"
  echo ""

  while true; do
    check_all_sessions
    sleep "$interval"
  done
}

# === MAIN ===

# Print the help text.
usage() {
  echo "pane-watchdog.sh — Detect stuck/dead tmux panes and auto-restart"
  echo ""
  echo "Usage:"
  echo "  pane-watchdog.sh              # One-shot check"
  echo "  pane-watchdog.sh --daemon     # Continuous monitoring"
  echo "  pane-watchdog.sh --status     # Show pane state"
  echo "  pane-watchdog.sh --session S  # Check one session"
  echo ""
  echo "Config (env vars):"
  echo "  PANE_CHECK_INTERVAL     Seconds between checks (default: 120)"
  echo "  PANE_WATCHDOG_SESSIONS  Comma-separated session names"
  echo "  PANE_STATE_FILE         State file path"
  echo "  PANE_WATCHDOG_LOG       Log file path"
  echo "  STUCK_CYCLES            Unchanged cycles before STUCK (default: 2)"
}

# Dispatch on the first command-line argument; no argument means a one-shot
# check of all sessions.
main() {
  case "${1:-}" in
    --daemon)
      run_daemon
      ;;
    --status)
      show_status
      ;;
    --session)
      if [[ -z "${2:-}" ]]; then
        echo "Usage: pane-watchdog.sh --session SESSION_NAME"
        exit 1
      fi
      MONITOR_SESSIONS="$2"
      check_all_sessions
      ;;
    --help | -h)
      usage
      ;;
    -*)
      # A mistyped flag must not silently turn into a check-and-restart run
      # over every session.
      echo "pane-watchdog.sh: unknown option: $1" >&2
      usage >&2
      exit 2
      ;;
    *)
      check_all_sessions
      ;;
  esac
}

main "$@"