feat: pane-watchdog — stuck pane detection + auto-restart (#515)

- Tracks output hash per tmux pane across check cycles
- 2 cycles of unchanged output = STUCK; PID gone = DEAD
- Only restarts hermes/agent panes (skips zsh, Python REPLs, etc.)
- Restart flow: kill pane → try --resume with session ID → fall back to a fresh prompt
- Max 3 restart attempts per pane per hour (cooldown prevents loops)
- First-run safe: no false positives on the initial check cycle
- Multi-window tmux safe: head -1 for pane queries
- Status display with colored icons and timing
- Daemon mode for continuous monitoring
- State persisted to ~/.hermes/pane-state.json

Closes #515
2026-04-13 17:52:00 -04:00
6 changed files with 515 additions and 249 deletions
--- a/bin/agent-dispatch.sh
+++ b/bin/agent-dispatch.sh
@@ -202,19 +202,6 @@ curl -s -X POST "{gitea_url}/api/v1/repos/{repo}/issues/{issue_num}/comments" \\
 REVIEW CHECKLIST BEFORE YOU PUSH:
 {review}

-COMMIT DISCIPLINE (CRITICAL):
- Commit every 3-5 tool calls. Do NOT wait until the end.
- After every meaningful file change: git add -A && git commit -m "WIP: <what changed>"
- Before running any destructive command: commit current state first.
- If you are unsure whether to commit: commit. WIP commits are safe. Lost work is not.
- Never use --no-verify.
- The auto-commit-guard is your safety net, but do not rely on it. Commit proactively.
-
-RECOVERY COMMANDS (if interrupted, another agent can resume):
-git log --oneline -10          # see your WIP commits
-git diff HEAD~1                # see what the last commit changed
-git status                     # see uncommitted work
-
 RULES:
 - Do not skip hooks with --no-verify.
 - Do not silently widen the scope.
--- a/bin/agent-loop.sh
+++ b/bin/agent-loop.sh
@@ -161,14 +161,6 @@ run_worker() {
    CYCLE_END=$(date +%s)
    CYCLE_DURATION=$((CYCLE_END - CYCLE_START))

-    # --- Mid-session auto-commit: commit before timeout if work is dirty ---
-    cd "$worktree" 2>/dev/null || true
-    # Ensure auto-commit-guard is running
-    if ! pgrep -f "auto-commit-guard.sh" >/dev/null 2>&1; then
-      log "Starting auto-commit-guard daemon"
-      nohup bash "$(dirname "$0")/auto-commit-guard.sh" 120 "$WORKTREE_BASE"         >> "$LOG_DIR/auto-commit-guard.log" 2>&1 &
-    fi
-
    # Salvage
    cd "$worktree" 2>/dev/null || true
    DIRTY=$(git status --porcelain 2>/dev/null | wc -l | tr -d ' ')
--- a/bin/auto-commit-guard.sh
+++ b/bin/auto-commit-guard.sh
@@ -1,159 +0,0 @@
-#!/usr/bin/env bash
-# auto-commit-guard.sh — Background daemon that auto-commits uncommitted work
-#
-# Usage: auto-commit-guard.sh [interval_seconds] [worktree_base]
-#   auto-commit-guard.sh          # defaults: 120s, ~/worktrees
-#   auto-commit-guard.sh 60       # check every 60s
-#   auto-commit-guard.sh 180 ~/my-worktrees
-#
-# Scans all git repos under the worktree base for uncommitted changes.
-# If dirty for >= 1 check cycle, auto-commits with a WIP message.
-# Pushes unpushed commits so work is always recoverable from the remote.
-#
-# Also scans /tmp for orphaned agent workdirs on startup.
-
-set -uo pipefail
-
-INTERVAL="${1:-120}"
-WORKTREE_BASE="${2:-$HOME/worktrees}"
-LOG_DIR="$HOME/.hermes/logs"
-LOG="$LOG_DIR/auto-commit-guard.log"
-PIDFILE="$LOG_DIR/auto-commit-guard.pid"
-ORPHAN_SCAN_DONE="$LOG_DIR/.orphan-scan-done"
-
-mkdir -p "$LOG_DIR"
-
-# Single instance guard
-if [ -f "$PIDFILE" ]; then
-  old_pid=$(cat "$PIDFILE")
-  if kill -0 "$old_pid" 2>/dev/null; then
-    echo "auto-commit-guard already running (PID $old_pid)" >&2
-    exit 0
-  fi
-fi
-echo $$ > "$PIDFILE"
-trap 'rm -f "$PIDFILE"' EXIT
-
-log() {
-  echo "[$(date '+%Y-%m-%d %H:%M:%S')] AUTO-COMMIT: $*" >> "$LOG"
-}
-
-# --- Orphaned workdir scan (runs once on startup) ---
-scan_orphans() {
-  if [ -f "$ORPHAN_SCAN_DONE" ]; then
-    return 0
-  fi
-  log "Scanning /tmp for orphaned agent workdirs..."
-  local found=0
-  local rescued=0
-
-  for dir in /tmp/*-work-* /tmp/timmy-burn-* /tmp/tc-burn; do
-    [ -d "$dir" ] || continue
-    [ -d "$dir/.git" ] || continue
-
-    found=$((found + 1))
-    cd "$dir" 2>/dev/null || continue
-
-    local dirty
-    dirty=$(git status --porcelain 2>/dev/null | wc -l | tr -d " ")
-    if [ "${dirty:-0}" -gt 0 ]; then
-      local branch
-      branch=$(git branch --show-current 2>/dev/null || echo "orphan")
-      git add -A 2>/dev/null
-      if git commit -m "WIP: orphan rescue — $dirty file(s) auto-committed on $(date -u +%Y-%m-%dT%H:%M:%SZ)
-
-Orphaned workdir detected at $dir.
-Branch: $branch
-Rescued by auto-commit-guard on startup." 2>/dev/null; then
-        rescued=$((rescued + 1))
-        log "RESCUED: $dir ($dirty files on branch $branch)"
-
-        # Try to push if remote exists
-        if git remote get-url origin >/dev/null 2>&1; then
-          git push -u origin "$branch" 2>/dev/null &&             log "PUSHED orphan rescue: $dir → $branch" ||             log "PUSH FAILED orphan rescue: $dir (no remote access)"
-        fi
-      fi
-    fi
-  done
-
-  log "Orphan scan complete: $found workdirs checked, $rescued rescued"
-  touch "$ORPHAN_SCAN_DONE"
-}
-
-# --- Main guard loop ---
-guard_cycle() {
-  local committed=0
-  local scanned=0
-
-  # Scan worktree base
-  if [ -d "$WORKTREE_BASE" ]; then
-    for dir in "$WORKTREE_BASE"/*/; do
-      [ -d "$dir" ] || continue
-      [ -d "$dir/.git" ] || continue
-
-      scanned=$((scanned + 1))
-      cd "$dir" 2>/dev/null || continue
-
-      local dirty
-      dirty=$(git status --porcelain 2>/dev/null | wc -l | tr -d " ")
-      [ "${dirty:-0}" -eq 0 ] && continue
-
-      local branch
-      branch=$(git branch --show-current 2>/dev/null || echo "detached")
-
-      git add -A 2>/dev/null
-      if git commit -m "WIP: auto-commit — $dirty file(s) on $branch
-
-Automated commit by auto-commit-guard at $(date -u +%Y-%m-%dT%H:%M:%SZ).
-Work preserved to prevent loss on crash." 2>/dev/null; then
-        committed=$((committed + 1))
-        log "COMMITTED: $dir ($dirty files, branch $branch)"
-
-        # Push to preserve remotely
-        if git remote get-url origin >/dev/null 2>&1; then
-          git push -u origin "$branch" 2>/dev/null &&             log "PUSHED: $dir → $branch" ||             log "PUSH FAILED: $dir (will retry next cycle)"
-        fi
-      fi
-    done
-  fi
-
-  # Also scan /tmp for agent workdirs
-  for dir in /tmp/*-work-*; do
-    [ -d "$dir" ] || continue
-    [ -d "$dir/.git" ] || continue
-
-    scanned=$((scanned + 1))
-    cd "$dir" 2>/dev/null || continue
-
-    local dirty
-    dirty=$(git status --porcelain 2>/dev/null | wc -l | tr -d " ")
-    [ "${dirty:-0}" -eq 0 ] && continue
-
-    local branch
-    branch=$(git branch --show-current 2>/dev/null || echo "detached")
-
-    git add -A 2>/dev/null
-    if git commit -m "WIP: auto-commit — $dirty file(s) on $branch
-
-Automated commit by auto-commit-guard at $(date -u +%Y-%m-%dT%H:%M:%SZ).
-Agent workdir preserved to prevent loss." 2>/dev/null; then
-      committed=$((committed + 1))
-      log "COMMITTED: $dir ($dirty files, branch $branch)"
-
-      if git remote get-url origin >/dev/null 2>&1; then
-        git push -u origin "$branch" 2>/dev/null &&           log "PUSHED: $dir → $branch" ||           log "PUSH FAILED: $dir (will retry next cycle)"
-      fi
-    fi
-  done
-
-  [ "$committed" -gt 0 ] && log "Cycle done: $scanned scanned, $committed committed"
-}
-
-# --- Entry point ---
-log "Starting auto-commit-guard (interval=${INTERVAL}s, worktree=${WORKTREE_BASE})"
-scan_orphans
-
-while true; do
-  guard_cycle
-  sleep "$INTERVAL"
-done
--- a/bin/pane-watchdog.sh
+++ b/bin/pane-watchdog.sh
@@ -0,0 +1,514 @@
+#!/usr/bin/env bash
+# pane-watchdog.sh — Detect stuck/dead tmux panes and auto-restart them
+#
+# Tracks output hash per pane across cycles. If a pane's captured output
+# hasn't changed for STUCK_CYCLES consecutive checks, the pane is STUCK.
+# Dead panes (PID gone) are also detected.
+#
+# On STUCK/DEAD:
+#   1. Kill the pane
+#   2. Attempt restart with --resume (session ID from manifest)
+#   3. Fallback: fresh prompt with last known task from logs
+#
+# State file: ~/.hermes/pane-state.json
+# Log: ~/.hermes/logs/pane-watchdog.log
+#
+# Usage:
+#   pane-watchdog.sh              # One-shot check all sessions
+#   pane-watchdog.sh --daemon     # Run every CHECK_INTERVAL seconds
+#   pane-watchdog.sh --status     # Print current pane state
+#   pane-watchdog.sh --session NAME  # Check only one session
+#
+# Issue: timmy-config #515
+
+set -uo pipefail
+export PATH="/opt/homebrew/bin:$HOME/.local/bin:$HOME/.hermes/bin:/usr/local/bin:$PATH"
+
+# === CONFIG === (values written as ${VAR:-default} can be overridden via the environment)
+STATE_FILE="${PANE_STATE_FILE:-$HOME/.hermes/pane-state.json}"
+LOG_FILE="${PANE_WATCHDOG_LOG:-$HOME/.hermes/logs/pane-watchdog.log}"
+CHECK_INTERVAL="${PANE_CHECK_INTERVAL:-120}"  # seconds between cycles
+STUCK_CYCLES="${STUCK_CYCLES:-2}"             # unchanged cycles before STUCK (--help documents it as an env var)
+MAX_RESTART_ATTEMPTS=3                        # per pane per RESTART_COOLDOWN window
+RESTART_COOLDOWN=3600                         # seconds after the last restart before the attempt counter resets
+CAPTURE_LINES=40                              # lines of output to hash
+
+# Sessions to monitor, comma-separated (all if empty)
+MONITOR_SESSIONS="${PANE_WATCHDOG_SESSIONS:-}"
+# The state and log directories may not exist on a first run
+mkdir -p "$(dirname "$STATE_FILE")" "$(dirname "$LOG_FILE")"
+
+log() {  # append "[timestamp] message" to $LOG_FILE; all arguments form the message
+    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
+}
+
+# === HELPERS ===
+
+# Print the SHA-256 hex digest of the pane's last CAPTURE_LINES lines of output
+capture_pane_hash() {
+    local pane="$1" snapshot
+    # When the capture fails, the literal text "DEAD" is hashed in place of pane output
+    snapshot="$(tmux capture-pane -t "$pane" -p -S "-${CAPTURE_LINES}" 2>/dev/null || echo "DEAD")"
+    echo -n "$snapshot" | shasum -a 256 | cut -d' ' -f1
+}
+
+# Succeed only when the target pane exists and its first process is still running
+pane_pid_alive() {
+    local target="$1"
+    local pid
+    # display-message resolves exactly this pane. `list-panes -t` prints every pane of
+    # the target's window, so `head -1` could return a sibling pane's PID instead.
+    pid=$(tmux display-message -p -t "$target" '#{pane_pid}' 2>/dev/null || echo "")
+    [ -n "$pid" ] || return 1  # pane doesn't exist
+    kill -0 "$pid" 2>/dev/null
+}
+
+# Print the start command of exactly the target pane (list-panes | head -1 gave the window's first pane)
+pane_start_command() {
+    local target="$1"
+    tmux display-message -p -t "$target" '#{pane_start_command}' 2>/dev/null || echo "unknown"
+}
+
+# Print the running command of exactly the target pane (list-panes printed one line per pane of the window)
+pane_current_command() {
+    local target="$1"
+    tmux display-message -p -t "$target" '#{pane_current_command}' 2>/dev/null || echo "unknown"
+}
+
+# Succeed if the command name looks like a hermes/agent process the watchdog may
+# restart; anything unmatched (zsh, python3 REPLs, ...) is left alone.
+is_restartable() {
+    local cmd="$1" glob
+    for glob in hermes '*hermes*' '*agent*' '*timmy*' '*kimi*' '*claude-loop*' '*gemini-loop*'; do
+        # $glob is deliberately unquoted on the right so [[ ]] treats it as a pattern
+        if [[ "$cmd" == $glob ]]; then
+            return 0
+        fi
+    done
+    return 1
+}
+
+# Print the session ID stored in the session's hermes manifest, or an empty line if unavailable
+get_hermes_session_id() {
+    local session_name="$1"
+    local manifest="$HOME/.hermes/sessions/${session_name}/manifest.json"
+    if [ ! -f "$manifest" ]; then echo ""; return 0; fi
+    # The path is passed as argv[1]; pasted into the source, a quote in it broke the script
+    python3 -c "
+import json, sys
+try:
+    with open(sys.argv[1]) as f:
+        m = json.load(f)
+    print(m.get('session_id', m.get('id', '')))
+except (OSError, ValueError, AttributeError):
+    pass
+" "$manifest" 2>/dev/null || echo ""
+}
+
+# Print (at most 200 bytes of) the last task-like line in the session's newest log from the past day
+get_last_task() {
+    local session_name="$1"
+    local log_dir="$HOME/.hermes/logs"
+    local log_file="" candidate
+    # Choose by modification time; sorting the paths only ordered them by name
+    while IFS= read -r candidate; do
+        if [ -z "$log_file" ] || [ "$candidate" -nt "$log_file" ]; then log_file="$candidate"; fi
+    done < <(find "$log_dir" -name "*${session_name}*" -type f -mtime -1 2>/dev/null)
+    [ -n "$log_file" ] || return 0
+    grep -i "task:\|prompt:\|issue\|working on" "$log_file" 2>/dev/null | tail -1 | sed 's/.*[:>] *//' | head -c 200
+}
+
+# Restart a stuck/dead pane by relaunching hermes inside the same tmux pane.
+# Tries `hermes chat --resume <id>` first, then a fresh chat. Returns 0 on success, 1 otherwise.
+restart_pane() {
+    local target="$1"
+    local session_name="${target%%:*}"
+    local session_id last_task cmd verify="$target"
+    local restart_log="$HOME/.hermes/logs/${session_name}-restart.log"
+    log "RESTART: Attempting to restart $target"
+    # Try --resume with session ID
+    session_id=$(get_hermes_session_id "$session_name")
+    session_id=${session_id//\'/}  # the value is embedded in a single-quoted shell string
+    if [ -n "$session_id" ]; then
+        log "RESTART: Trying --resume with session $session_id"
+        # respawn-pane -k kills the old command and reuses the pane: the target keeps
+        # its index, and killing a session's only pane no longer tears the session down.
+        tmux respawn-pane -k -t "$target" \
+            "hermes chat --resume '$session_id' 2>&1 | tee -a '$restart_log'" 2>/dev/null || true
+        sleep 2
+        if pane_pid_alive "$target" 2>/dev/null; then
+            log "RESTART: Success with --resume"
+            return 0
+        fi
+    fi
+
+    # Fallback: fresh prompt
+    last_task=$(get_last_task "$session_name")
+    last_task=${last_task//\'/}  # log text is untrusted; a quote would end the echo string
+    if [ -n "$last_task" ]; then
+        log "RESTART: Fallback — fresh prompt with task: $last_task"
+        cmd="echo 'Watchdog restart — last task: $last_task' && hermes chat 2>&1 | tee -a '$restart_log'"
+    else
+        log "RESTART: Fallback — fresh hermes chat"
+        cmd="hermes chat 2>&1 | tee -a '$restart_log'"
+    fi
+    if ! tmux respawn-pane -k -t "$target" "$cmd" 2>/dev/null; then
+        # Pane already gone: open a replacement and verify that pane by ID, not a fixed ":1"
+        verify=$(tmux split-window -t "$session_name" -d -P -F '#{pane_id}' "$cmd" 2>/dev/null || echo "")
+    fi
+    sleep 2
+    if [ -n "$verify" ] && pane_pid_alive "$verify" 2>/dev/null; then
+        log "RESTART: Fallback restart succeeded"
+        return 0
+    fi
+    log "RESTART: FAILED to restart $target"
+    return 1
+}
+
+# === STATE MANAGEMENT ===
+
+# Print the persisted state JSON on stdout.
+# Falls back to an empty object ("{}") when the state file does not exist yet.
+read_state() {
+    local empty="{}"
+    [ -f "$STATE_FILE" ] || { echo "$empty"; return; }
+    cat "$STATE_FILE"
+}
+
+write_state() {  # $1 = JSON text; replaces the entire state file
+    local payload="$1"; echo "$payload" >"$STATE_FILE"
+}
+
+# Record one observation of a pane in the state file and print that pane's entry as JSON.
+# Args: $1 pane target, $2 output hash, $3 "true" when the pane process is alive.
+# Resulting status: DEAD, STUCK (unchanged for >= STUCK_CYCLES checks), STALE, or OK.
+update_pane_state() {
+    local target="$1" hash="$2" is_alive="$3" now
+    now=$(date +%s)
+    python3 - "$STATE_FILE" "$target" "$hash" "$is_alive" "$now" "$STUCK_CYCLES" <<'PYEOF'
+import json, os, sys
+
+state_file = sys.argv[1]
+target = sys.argv[2]
+new_hash = sys.argv[3]
+is_alive = sys.argv[4] == "true"
+now = int(sys.argv[5])
+stuck_cycles = int(sys.argv[6])
+
+try:
+    with open(state_file) as f:
+        state = json.load(f)
+except (FileNotFoundError, json.JSONDecodeError):
+    state = {}
+
+# First sighting: the empty hash never equals a real digest, so the first check is OK
+pane = state.get(target, {
+    "hash": "",
+    "same_count": 0,
+    "status": "UNKNOWN",
+    "last_change": 0,
+    "last_check": 0,
+    "restart_attempts": 0,
+    "last_restart": 0,
+    "current_command": "",
+})
+
+if not is_alive:
+    pane["status"] = "DEAD"
+    pane["same_count"] = 0
+elif new_hash == pane.get("hash", ""):
+    pane["same_count"] = pane.get("same_count", 0) + 1
+    # same_count is >= 1 here, so below the threshold the pane is STALE, never OK
+    pane["status"] = "STUCK" if pane["same_count"] >= stuck_cycles else "STALE"
+else:
+    pane["hash"] = new_hash
+    pane["same_count"] = 0
+    pane["status"] = "OK"
+    pane["last_change"] = now
+
+pane["last_check"] = now
+state[target] = pane
+
+# Write a temp file and rename it: an interrupted write must not truncate the state,
+# because a corrupt file is read back as {} and would reset every restart counter.
+tmp_file = f"{state_file}.{os.getpid()}.tmp"
+with open(tmp_file, "w") as f:
+    json.dump(state, f, indent=2)
+os.replace(tmp_file, state_file)
+print(json.dumps(pane))
+PYEOF
+}
+
+# Print the pane's restart-attempt count, first zeroing it if the cooldown has expired.
+# A missing or corrupt state file counts as "no attempts", matching update_pane_state.
+maybe_reset_restarts() {
+    local target="$1"
+    local now
+    now=$(date +%s)
+    python3 - "$STATE_FILE" "$target" "$now" "$RESTART_COOLDOWN" <<'PYEOF'
+import json, sys
+state_file = sys.argv[1]
+target = sys.argv[2]
+now = int(sys.argv[3])
+cooldown = int(sys.argv[4])
+
+try:
+    with open(state_file) as f:
+        state = json.load(f)
+except (FileNotFoundError, json.JSONDecodeError):
+    state = {}
+
+pane = state.get(target, {})
+# Budget window: attempts are forgotten once `cooldown` seconds have passed since the last restart
+if now - pane.get("last_restart", 0) > cooldown:
+    pane["restart_attempts"] = 0
+
+state[target] = pane
+with open(state_file, "w") as f:
+    json.dump(state, f, indent=2)
+print(pane.get("restart_attempts", 0))
+PYEOF
+}
+
+# Count one restart for the pane, stamp last_restart, mark it RESTARTING; prints the new count
+increment_restart_attempt() {
+    local target="$1" now
+    now=$(date +%s)
+    python3 - "$STATE_FILE" "$target" "$now" <<'PYEOF'
+import json, sys
+state_file = sys.argv[1]
+target = sys.argv[2]
+now = int(sys.argv[3])
+
+# Same tolerance as update_pane_state: a missing/corrupt file starts from empty state
+try:
+    with open(state_file) as f:
+        state = json.load(f)
+except (FileNotFoundError, json.JSONDecodeError):
+    state = {}
+
+pane = state.get(target, {})
+pane["restart_attempts"] = pane.get("restart_attempts", 0) + 1
+pane["last_restart"] = now
+pane["status"] = "RESTARTING"
+state[target] = pane
+with open(state_file, "w") as f:
+    json.dump(state, f, indent=2)
+print(pane["restart_attempts"])
+PYEOF
+}
+
+# === CORE CHECK ===
+
+# Check one pane: record its output hash and liveness, then act on the new status.
+# DEAD/STUCK panes are handed to handle_stuck when their command is restartable.
+check_pane() {
+    local target="$1"
+    local hash is_alive status current_cmd
+    local result count reason
+
+    # Capture state
+    hash=$(capture_pane_hash "$target")
+    if pane_pid_alive "$target"; then
+        is_alive="true"
+    else
+        is_alive="false"
+    fi
+
+    # Get current command for the pane. When tmux reports none (presumably the case
+    # once the pane's process has exited — confirm), use the pane's start command so
+    # the restart gate below can still recognise a hermes pane.
+    current_cmd=$(pane_current_command "$target")
+    if [ -z "$current_cmd" ] || [ "$current_cmd" = "unknown" ]; then
+        current_cmd=$(pane_start_command "$target")
+    fi
+
+    # Update state and get result
+    result=$(update_pane_state "$target" "$hash" "$is_alive")
+    status=$(echo "$result" | python3 -c "import json,sys; print(json.loads(sys.stdin.read()).get('status','UNKNOWN'))" 2>/dev/null || echo "UNKNOWN")
+
+    case "$status" in
+        OK)
+            # Healthy, do nothing
+            ;;
+        DEAD|STUCK)
+            reason="PID gone"
+            [ "$status" = "STUCK" ] && reason="output unchanged for ${STUCK_CYCLES} cycles"
+            log "DETECTED: $target is $status ($reason) cmd=$current_cmd"
+            if is_restartable "$current_cmd"; then
+                handle_stuck "$target"
+            else
+                log "SKIP: $target not a hermes pane (cmd=$current_cmd), not restarting"
+            fi
+            ;;
+        STALE)
+            # Output unchanged but within threshold — just log
+            count=$(echo "$result" | python3 -c "import json,sys; print(json.loads(sys.stdin.read()).get('same_count',0))" 2>/dev/null || echo "?")
+            log "STALE: $target unchanged for $count cycle(s)"
+            ;;
+    esac
+}
+
+# Restart a stuck/dead pane unless its restart budget is spent (then alert on stderr, return 1)
+handle_stuck() {
+    local target="$1" attempts
+    attempts=$(maybe_reset_restarts "$target")
+    # Fail closed: an unreadable count must not turn into unlimited restarts
+    if ! [[ "$attempts" =~ ^[0-9]+$ ]]; then
+        log "ERROR: cannot read restart count for $target — skipping restart"
+        return 1
+    fi
+    if [ "$attempts" -ge "$MAX_RESTART_ATTEMPTS" ]; then
+        log "ESCALATION: $target stuck ${attempts}x — manual intervention needed"
+        echo "ALERT: $target stuck after $attempts restart attempts" >&2
+        return 1
+    fi
+    attempts=$(increment_restart_attempt "$target")
+    log "ACTION: Restarting $target (attempt $attempts/$MAX_RESTART_ATTEMPTS)"
+    if restart_pane "$target"; then
+        log "OK: $target restarted successfully"
+    else
+        log "FAIL: $target restart failed (attempt $attempts)"
+    fi
+}
+
+# Run check_pane on every pane of each monitored session: the comma-separated
+# MONITOR_SESSIONS list, or every tmux session when that list is empty.
+check_all_sessions() {
+    local sessions=() session target line total=0
+
+    if [ -n "$MONITOR_SESSIONS" ]; then
+        IFS=',' read -ra sessions <<< "$MONITOR_SESSIONS"
+    else
+        while IFS= read -r line; do
+            [ -n "$line" ] && sessions+=("$line")
+        done < <(tmux list-sessions -F '#{session_name}' 2>/dev/null || true)
+    fi
+
+    # ${arr[@]+...}: an empty list must not trip `set -u` on bash older than 4.4
+    for session in ${sessions[@]+"${sessions[@]}"}; do
+        [ -z "$session" ] && continue
+        # -s lists the panes of every window (not just the current one); targets are read
+        # line by line on fd 3 so names with spaces survive and stdin stays untouched.
+        while IFS= read -r target <&3; do
+            [ -n "$target" ] || continue
+            check_pane "$target"
+            total=$((total + 1))
+        done 3< <(tmux list-panes -s -t "$session" -F "${session}:#{window_index}.#{pane_index}" 2>/dev/null || true)
+    done
+    log "CHECK: Processed $total panes"
+}
+
+# === STATUS DISPLAY ===
+
+# Print a table of every pane recorded in the state file: status, unchanged-check
+# count, time since the last output change, and restart attempts.
+show_status() {
+    if ! [ -f "$STATE_FILE" ]; then
+        echo "No pane state file found at $STATE_FILE"
+        echo "Run pane-watchdog.sh once to initialize."
+        exit 0
+    fi
+
+    python3 - "$STATE_FILE" <<'PYEOF'
+import json, sys, time
+
+# Status -> icon; any status not listed here is shown as "?"
+ICONS = {"OK": "✓", "STUCK": "✖", "DEAD": "☠", "STALE": "⏳"}
+
+
+def describe_age(seconds):
+    """Format an age in seconds as 'Ns ago', 'Nm ago' or 'Nh ago' (whole units)."""
+    if seconds < 60:
+        return f"{seconds}s ago"
+    if seconds < 3600:
+        return f"{seconds//60}m ago"
+    return f"{seconds//3600}h ago"
+
+
+path = sys.argv[1]
+try:
+    with open(path) as fh:
+        panes = json.load(fh)
+except (FileNotFoundError, json.JSONDecodeError):
+    print("No state data yet.")
+    sys.exit(0)
+
+if not panes:
+    print("No panes tracked.")
+    sys.exit(0)
+
+now = int(time.time())
+# Header row, then a 90-character rule
+print(f"{'PANE':<35} {'STATUS':<12} {'STALE':<6} {'LAST CHANGE':<15} {'RESTARTS'}")
+print("-" * 90)
+
+for name in sorted(panes.keys()):
+    entry = panes[name]
+    status = entry.get("status", "?")
+    same = entry.get("same_count", 0)
+    changed_at = entry.get("last_change", 0)
+    restarts = entry.get("restart_attempts", 0)
+
+    # A zero or missing timestamp means no output change was ever recorded
+    if changed_at:
+        change_str = describe_age(now - changed_at)
+    else:
+        change_str = "never"
+    icon = ICONS.get(status, "?")
+
+    print(f"  {icon} {name:<32} {status:<12} {same:<6} {change_str:<15} {restarts}")
+PYEOF
+}
+
+# === DAEMON MODE ===
+
+# Daemon mode: announce the configuration once, then check all sessions every
+# CHECK_INTERVAL seconds. Never returns; stop it by interrupting the process.
+run_daemon() {
+    log "DAEMON: Starting (interval=${CHECK_INTERVAL}s, stuck_threshold=${STUCK_CYCLES})"
+    printf '%s\n' \
+        "Pane watchdog started. Checking every ${CHECK_INTERVAL}s. Ctrl+C to stop." \
+        "Log: $LOG_FILE" "State: $STATE_FILE" ""
+    until false; do
+        check_all_sessions
+        sleep "$CHECK_INTERVAL"
+    done
+}
+
+# === MAIN ===
+
+case "${1:-}" in
+    --daemon) run_daemon ;;
+    --status) show_status ;;
+    --session)
+        if [ -z "${2:-}" ]; then
+            echo "Usage: pane-watchdog.sh --session SESSION_NAME" >&2
+            exit 1
+        fi
+        MONITOR_SESSIONS="$2"
+        check_all_sessions
+        ;;
+    --help|-h)
+        echo "pane-watchdog.sh — Detect stuck/dead tmux panes and auto-restart"
+        echo ""
+        echo "Usage:"
+        echo "  pane-watchdog.sh              # One-shot check"
+        echo "  pane-watchdog.sh --daemon     # Continuous monitoring"
+        echo "  pane-watchdog.sh --status     # Show pane state"
+        echo "  pane-watchdog.sh --session S  # Check one session"
+        echo ""
+        echo "Config (env vars):"
+        echo "  PANE_CHECK_INTERVAL    Seconds between checks (default: 120)"
+        echo "  PANE_WATCHDOG_SESSIONS Comma-separated session names"
+        echo "  PANE_STATE_FILE        State file path"
+        echo "  PANE_WATCHDOG_LOG      Log file path"
+        echo "  STUCK_CYCLES           Unchanged cycles before STUCK (default: 2)"
+        ;;
+    "") check_all_sessions ;;  # no argument: one-shot check of all sessions
+    *)
+        # A mistyped flag used to fall through to a full check, restarts included
+        echo "pane-watchdog.sh: unknown option: $1 (try --help)" >&2
+        exit 2
+        ;;
+esac
--- a/bin/timmy-orchestrator.sh
+++ b/bin/timmy-orchestrator.sh
@@ -3,7 +3,7 @@
 # Uses Hermes CLI plus workforce-manager to triage and review.
 # Timmy is the brain. Other agents are the hands.

-set -uo pipefail\n\nSCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+set -uo pipefail

 LOG_DIR="$HOME/.hermes/logs"
 LOG="$LOG_DIR/timmy-orchestrator.log"
@@ -40,7 +40,6 @@ gather_state() {
  > "$state_dir/unassigned.txt"
  > "$state_dir/open_prs.txt"
  > "$state_dir/agent_status.txt"
-  > "$state_dir/uncommitted_work.txt"

  for repo in $REPOS; do
    local short=$(echo "$repo" | cut -d/ -f2)
@@ -72,24 +71,6 @@ for p in json.load(sys.stdin):
  tail -50 "/tmp/kimi-heartbeat.log" 2>/dev/null | grep -c "FAILED:" | xargs -I{} echo "Kimi recent failures: {}" >> "$state_dir/agent_status.txt"
  tail -1 "/tmp/kimi-heartbeat.log" 2>/dev/null | xargs -I{} echo "Kimi last event: {}" >> "$state_dir/agent_status.txt"

-  # Scan worktrees for uncommitted work
-  for wt_dir in "$HOME/worktrees"/*/; do
-    [ -d "$wt_dir" ] || continue
-    [ -d "$wt_dir/.git" ] || continue
-    local dirty
-    dirty=$(cd "$wt_dir" && git status --porcelain 2>/dev/null | wc -l | tr -d " ")
-    if [ "${dirty:-0}" -gt 0 ]; then
-      local branch
-      branch=$(cd "$wt_dir" && git branch --show-current 2>/dev/null || echo "?")
-      local age=""
-      local last_commit
-      last_commit=$(cd "$wt_dir" && git log -1 --format=%ct 2>/dev/null || echo 0)
-      local now=$(date +%s)
-      local stale_mins=$(( (now - last_commit) / 60 ))
-      echo "DIR=$wt_dir BRANCH=$branch DIRTY=$dirty STALE=${stale_mins}m" >> "$state_dir/uncommitted_work.txt"
-    fi
-  done
-
  echo "$state_dir"
 }

@@ -100,25 +81,6 @@ run_triage() {

  log "Cycle: $unassigned_count unassigned, $pr_count open PRs"

-  # Check for uncommitted work — nag if stale
-  local uncommitted_count
-  uncommitted_count=$(wc -l < "$state_dir/uncommitted_work.txt" 2>/dev/null | tr -d " " || echo 0)
-  if [ "${uncommitted_count:-0}" -gt 0 ]; then
-    log "WARNING: $uncommitted_count worktree(s) with uncommitted work"
-    while IFS= read -r line; do
-      log "  UNCOMMITTED: $line"
-      # Auto-commit stale work (>60 min without commit)
-      local stale=$(echo "$line" | sed 's/.*STALE=\([0-9]*\)m.*/\1/')
-      local wt_dir=$(echo "$line" | sed 's/.*DIR=\([^ ]*\) .*/\1/')
-      if [ "${stale:-0}" -gt 60 ]; then
-        log "  AUTO-COMMITTING stale work in $wt_dir (${stale}m stale)"
-        (cd "$wt_dir" && git add -A && git commit -m "WIP: orchestrator auto-commit — ${stale}m stale work
-
-Preserved by timmy-orchestrator to prevent loss." 2>/dev/null &&           git push 2>/dev/null) && log "  COMMITTED: $wt_dir" || log "  COMMIT FAILED: $wt_dir"
-      fi
-    done < "$state_dir/uncommitted_work.txt"
-  fi
-
  # If nothing to do, skip the LLM call
  if [ "$unassigned_count" -eq 0 ] && [ "$pr_count" -eq 0 ]; then
    log "Nothing to triage"
@@ -236,12 +198,6 @@ FOOTER
 log "=== Timmy Orchestrator Started (PID $$) ==="
 log "Cycle: ${CYCLE_INTERVAL}s | Auto-assign: ${AUTO_ASSIGN_UNASSIGNED} | Inference surface: Hermes CLI"

-# Start auto-commit-guard daemon for work preservation
-if ! pgrep -f "auto-commit-guard.sh" >/dev/null 2>&1; then
-  nohup bash "$SCRIPT_DIR/auto-commit-guard.sh" 120 >> "$LOG_DIR/auto-commit-guard.log" 2>&1 &
-  log "Started auto-commit-guard daemon (PID $!)"
-fi
-
 WORKFORCE_CYCLE=0

 while true; do
--- a/deploy/auto-commit-guard.plist
+++ b/deploy/auto-commit-guard.plist
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-    <key>Label</key>
-    <string>ai.timmy.auto-commit-guard</string>
-    <key>ProgramArguments</key>
-    <array>
-        <string>/bin/bash</string>
-        <string>/Users/apayne/.hermes/bin/auto-commit-guard.sh</string>
-        <string>120</string>
-    </array>
-    <key>RunAtLoad</key>
-    <true/>
-    <key>KeepAlive</key>
-    <true/>
-    <key>StandardOutPath</key>
-    <string>/Users/apayne/.hermes/logs/auto-commit-guard.stdout.log</string>
-    <key>StandardErrorPath</key>
-    <string>/Users/apayne/.hermes/logs/auto-commit-guard.stderr.log</string>
-    <key>WorkingDirectory</key>
-    <string>/Users/apayne</string>
-</dict>
-</plist>