Compare commits

...

1 Commits

Author SHA1 Message Date
Alexander Whitestone
e419ada5c5 feat: pane-watchdog — stuck pane detection + auto-restart (#515)
Some checks failed
Architecture Lint / Lint Repository (pull_request) Failing after 15s
Architecture Lint / Linter Tests (pull_request) Successful in 20s
Smoke Test / smoke (pull_request) Failing after 17s
Validate Config / YAML Lint (pull_request) Failing after 13s
Validate Config / JSON Validate (pull_request) Successful in 15s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 21s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 45s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 10s
Validate Config / Playbook Schema Validation (pull_request) Successful in 18s
PR Checklist / pr-checklist (pull_request) Successful in 3m20s
- Tracks output hash per tmux pane across check cycles
- 2 cycles unchanged output = STUCK, PID gone = DEAD
- Only restarts hermes/agent panes (skips zsh, python repls, etc)
- Restart flow: kill pane → try --resume with session ID → fallback fresh prompt
- Max 3 restart attempts per pane per hour (cooldown prevents loops)
- First-run safe: no false positives on initial check cycle
- Multi-window tmux safe: head -1 for pane queries
- Status display with colored icons and timing
- Daemon mode for continuous monitoring
- State persisted to ~/.hermes/pane-state.json

Closes #515
2026-04-13 17:52:00 -04:00

514
bin/pane-watchdog.sh Executable file
View File

@@ -0,0 +1,514 @@
#!/usr/bin/env bash
# pane-watchdog.sh — Detect stuck/dead tmux panes and auto-restart them
#
# Tracks output hash per pane across cycles. If a pane's captured output
# hasn't changed for STUCK_CYCLES consecutive checks, the pane is STUCK.
# Dead panes (PID gone) are also detected.
#
# On STUCK/DEAD:
# 1. Kill the pane
# 2. Attempt restart with --resume (session ID from manifest)
# 3. Fallback: fresh prompt with last known task from logs
#
# State file: ~/.hermes/pane-state.json
# Log: ~/.hermes/logs/pane-watchdog.log
#
# Usage:
# pane-watchdog.sh # One-shot check all sessions
# pane-watchdog.sh --daemon # Run every CHECK_INTERVAL seconds
# pane-watchdog.sh --status # Print current pane state
# pane-watchdog.sh --session NAME # Check only one session
#
# Issue: timmy-config #515
set -uo pipefail
export PATH="/opt/homebrew/bin:$HOME/.local/bin:$HOME/.hermes/bin:/usr/local/bin:$PATH"
# === CONFIG ===
STATE_FILE="${PANE_STATE_FILE:-$HOME/.hermes/pane-state.json}"
LOG_FILE="${PANE_WATCHDOG_LOG:-$HOME/.hermes/logs/pane-watchdog.log}"
CHECK_INTERVAL="${PANE_CHECK_INTERVAL:-120}" # seconds between cycles
STUCK_CYCLES=2 # unchanged cycles before STUCK
MAX_RESTART_ATTEMPTS=3 # per pane per hour
RESTART_COOLDOWN=3600 # seconds between escalation alerts
CAPTURE_LINES=40 # lines of output to hash
# Sessions to monitor (all if empty)
MONITOR_SESSIONS="${PANE_WATCHDOG_SESSIONS:-}"
mkdir -p "$(dirname "$STATE_FILE")" "$(dirname "$LOG_FILE")"
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
}
# === HELPERS ===
# Capture last N lines of pane output and hash them
capture_pane_hash() {
local target="$1"
local output
output=$(tmux capture-pane -t "$target" -p -S "-${CAPTURE_LINES}" 2>/dev/null || echo "DEAD")
echo -n "$output" | shasum -a 256 | cut -d' ' -f1
}
# Check if pane PID is alive
pane_pid_alive() {
local target="$1"
local pid
pid=$(tmux list-panes -t "$target" -F '#{pane_pid}' 2>/dev/null | head -1 || echo "")
if [ -z "$pid" ]; then
return 1 # pane doesn't exist
fi
kill -0 "$pid" 2>/dev/null
}
# Get pane start command
pane_start_command() {
local target="$1"
tmux list-panes -t "$target" -F '#{pane_start_command}' 2>/dev/null | head -1 || echo "unknown"
}
# Get the pane's current running command (child process)
pane_current_command() {
local target="$1"
tmux list-panes -t "$target" -F '#{pane_current_command}' 2>/dev/null || echo "unknown"
}
# Only restart panes running hermes/agent commands (not zsh, python3 repls, etc.)
is_restartable() {
local cmd="$1"
case "$cmd" in
hermes|*hermes*|*agent*|*timmy*|*kimi*|*claude-loop*|*gemini-loop*)
return 0
;;
*)
return 1
;;
esac
}
# Get session ID from hermes manifest if available
get_hermes_session_id() {
local session_name="$1"
local manifest="$HOME/.hermes/sessions/${session_name}/manifest.json"
if [ -f "$manifest" ]; then
python3 -c "
import json, sys
try:
m = json.load(open('$manifest'))
print(m.get('session_id', m.get('id', '')))
except: pass
" 2>/dev/null || echo ""
else
echo ""
fi
}
# Get last task from pane logs
get_last_task() {
local session_name="$1"
local log_dir="$HOME/.hermes/logs"
# Find the most recent log for this session
local log_file
log_file=$(find "$log_dir" -name "*${session_name}*" -type f -mtime -1 2>/dev/null | sort -r | head -1)
if [ -n "$log_file" ] && [ -f "$log_file" ]; then
# Extract last user prompt or task description
grep -i "task:\|prompt:\|issue\|working on" "$log_file" 2>/dev/null | tail -1 | sed 's/.*[:>] *//' | head -c 200
fi
}
# Restart a pane with a fresh shell/command
restart_pane() {
local target="$1"
local session_name="${target%%:*}"
local session_id last_task cmd
log "RESTART: Attempting to restart $target"
# Kill existing pane
tmux kill-pane -t "$target" 2>/dev/null || true
sleep 1
# Try --resume with session ID
session_id=$(get_hermes_session_id "$session_name")
if [ -n "$session_id" ]; then
log "RESTART: Trying --resume with session $session_id"
tmux split-window -t "$session_name" -d \
"hermes chat --resume '$session_id' 2>&1 | tee -a '$HOME/.hermes/logs/${session_name}-restart.log'"
sleep 2
if pane_pid_alive "${session_name}:1" 2>/dev/null; then
log "RESTART: Success with --resume"
return 0
fi
fi
# Fallback: fresh prompt
last_task=$(get_last_task "$session_name")
if [ -n "$last_task" ]; then
log "RESTART: Fallback — fresh prompt with task: $last_task"
tmux split-window -t "$session_name" -d \
"echo 'Watchdog restart — last task: $last_task' && hermes chat 2>&1 | tee -a '$HOME/.hermes/logs/${session_name}-restart.log'"
else
log "RESTART: Fallback — fresh hermes chat"
tmux split-window -t "$session_name" -d \
"hermes chat 2>&1 | tee -a '$HOME/.hermes/logs/${session_name}-restart.log'"
fi
sleep 2
if pane_pid_alive "${session_name}:1" 2>/dev/null; then
log "RESTART: Fallback restart succeeded"
return 0
else
log "RESTART: FAILED to restart $target"
return 1
fi
}
# === STATE MANAGEMENT ===
read_state() {
if [ -f "$STATE_FILE" ]; then
cat "$STATE_FILE"
else
echo "{}"
fi
}
write_state() {
echo "$1" > "$STATE_FILE"
}
# Update state for a single pane and return JSON status
update_pane_state() {
local target="$1"
local hash="$2"
local is_alive="$3"
local now
now=$(date +%s)
python3 - "$STATE_FILE" "$target" "$hash" "$is_alive" "$now" "$STUCK_CYCLES" <<'PYEOF'
import json, sys, time
state_file = sys.argv[1]
target = sys.argv[2]
new_hash = sys.argv[3]
is_alive = sys.argv[4] == "true"
now = int(sys.argv[5])
stuck_cycles = int(sys.argv[6])
try:
with open(state_file) as f:
state = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
state = {}
pane = state.get(target, {
"hash": "",
"same_count": 0,
"status": "UNKNOWN",
"last_change": 0,
"last_check": 0,
"restart_attempts": 0,
"last_restart": 0,
"current_command": "",
})
if not is_alive:
pane["status"] = "DEAD"
pane["same_count"] = 0
elif new_hash == pane.get("hash", ""):
pane["same_count"] = pane.get("same_count", 0) + 1
if pane["same_count"] >= stuck_cycles:
pane["status"] = "STUCK"
else:
pane["status"] = "STALE" if pane["same_count"] > 0 else "OK"
else:
pane["hash"] = new_hash
pane["same_count"] = 0
pane["status"] = "OK"
pane["last_change"] = now
pane["last_check"] = now
state[target] = pane
with open(state_file, "w") as f:
json.dump(state, f, indent=2)
print(json.dumps(pane))
PYEOF
}
# Reset restart attempt counter if cooldown expired
maybe_reset_restarts() {
local target="$1"
local now
now=$(date +%s)
python3 - "$STATE_FILE" "$target" "$now" "$RESTART_COOLDOWN" <<'PYEOF'
import json, sys
state_file = sys.argv[1]
target = sys.argv[2]
now = int(sys.argv[3])
cooldown = int(sys.argv[4])
with open(state_file) as f:
state = json.load(f)
pane = state.get(target, {})
last_restart = pane.get("last_restart", 0)
if now - last_restart > cooldown:
pane["restart_attempts"] = 0
state[target] = pane
with open(state_file, "w") as f:
json.dump(state, f, indent=2)
print(pane.get("restart_attempts", 0))
PYEOF
}
increment_restart_attempt() {
local target="$1"
local now
now=$(date +%s)
python3 - "$STATE_FILE" "$target" "$now" <<'PYEOF'
import json, sys
state_file = sys.argv[1]
target = sys.argv[2]
now = int(sys.argv[3])
with open(state_file) as f:
state = json.load(f)
pane = state.get(target, {})
pane["restart_attempts"] = pane.get("restart_attempts", 0) + 1
pane["last_restart"] = now
pane["status"] = "RESTARTING"
state[target] = pane
with open(state_file, "w") as f:
json.dump(state, f, indent=2)
print(pane["restart_attempts"])
PYEOF
}
# === CORE CHECK ===
check_pane() {
local target="$1"
local hash is_alive status current_cmd
# Capture state
hash=$(capture_pane_hash "$target")
if pane_pid_alive "$target"; then
is_alive="true"
else
is_alive="false"
fi
# Get current command for the pane
current_cmd=$(pane_current_command "$target")
# Update state and get result
local result
result=$(update_pane_state "$target" "$hash" "$is_alive")
status=$(echo "$result" | python3 -c "import json,sys; print(json.loads(sys.stdin.read()).get('status','UNKNOWN'))" 2>/dev/null || echo "UNKNOWN")
case "$status" in
OK)
# Healthy, do nothing
;;
DEAD)
log "DETECTED: $target is DEAD (PID gone) cmd=$current_cmd"
if is_restartable "$current_cmd"; then
handle_stuck "$target"
else
log "SKIP: $target not a hermes pane (cmd=$current_cmd), not restarting"
fi
;;
STUCK)
log "DETECTED: $target is STUCK (output unchanged for ${STUCK_CYCLES} cycles) cmd=$current_cmd"
if is_restartable "$current_cmd"; then
handle_stuck "$target"
else
log "SKIP: $target not a hermes pane (cmd=$current_cmd), not restarting"
fi
;;
STALE)
# Output unchanged but within threshold — just log
local count
count=$(echo "$result" | python3 -c "import json,sys; print(json.loads(sys.stdin.read()).get('same_count',0))" 2>/dev/null || echo "?")
log "STALE: $target unchanged for $count cycle(s)"
;;
esac
}
handle_stuck() {
local target="$1"
local session_name="${target%%:*}"
local attempts
# Check restart budget
attempts=$(maybe_reset_restarts "$target")
if [ "$attempts" -ge "$MAX_RESTART_ATTEMPTS" ]; then
log "ESCALATION: $target stuck ${attempts}x — manual intervention needed"
echo "ALERT: $target stuck after $attempts restart attempts" >&2
return 1
fi
attempts=$(increment_restart_attempt "$target")
log "ACTION: Restarting $target (attempt $attempts/$MAX_RESTART_ATTEMPTS)"
if restart_pane "$target"; then
log "OK: $target restarted successfully"
else
log "FAIL: $target restart failed (attempt $attempts)"
fi
}
check_all_sessions() {
local sessions
if [ -n "$MONITOR_SESSIONS" ]; then
IFS=',' read -ra sessions <<< "$MONITOR_SESSIONS"
else
sessions=()
while IFS= read -r line; do
[ -n "$line" ] && sessions+=("$line")
done < <(tmux list-sessions -F '#{session_name}' 2>/dev/null || true)
fi
local total=0 stuck=0 dead=0 ok=0
for session in "${sessions[@]}"; do
[ -z "$session" ] && continue
# Get pane targets
local panes
panes=$(tmux list-panes -t "$session" -F "${session}:#{window_index}.#{pane_index}" 2>/dev/null || true)
for target in $panes; do
check_pane "$target"
total=$((total + 1))
done
done
log "CHECK: Processed $total panes"
}
# === STATUS DISPLAY ===
show_status() {
if [ ! -f "$STATE_FILE" ]; then
echo "No pane state file found at $STATE_FILE"
echo "Run pane-watchdog.sh once to initialize."
exit 0
fi
python3 - "$STATE_FILE" <<'PYEOF'
import json, sys, time
state_file = sys.argv[1]
try:
with open(state_file) as f:
state = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
print("No state data yet.")
sys.exit(0)
if not state:
print("No panes tracked.")
sys.exit(0)
now = int(time.time())
print(f"{'PANE':<35} {'STATUS':<12} {'STALE':<6} {'LAST CHANGE':<15} {'RESTARTS'}")
print("-" * 90)
for target in sorted(state.keys()):
p = state[target]
status = p.get("status", "?")
same = p.get("same_count", 0)
last_change = p.get("last_change", 0)
restarts = p.get("restart_attempts", 0)
if last_change:
ago = now - last_change
if ago < 60:
change_str = f"{ago}s ago"
elif ago < 3600:
change_str = f"{ago//60}m ago"
else:
change_str = f"{ago//3600}h ago"
else:
change_str = "never"
# Color code
if status == "OK":
icon = "✓"
elif status == "STUCK":
icon = "✖"
elif status == "DEAD":
icon = "☠"
elif status == "STALE":
icon = "⏳"
else:
icon = "?"
print(f" {icon} {target:<32} {status:<12} {same:<6} {change_str:<15} {restarts}")
PYEOF
}
# === DAEMON MODE ===
run_daemon() {
log "DAEMON: Starting (interval=${CHECK_INTERVAL}s, stuck_threshold=${STUCK_CYCLES})"
echo "Pane watchdog started. Checking every ${CHECK_INTERVAL}s. Ctrl+C to stop."
echo "Log: $LOG_FILE"
echo "State: $STATE_FILE"
echo ""
while true; do
check_all_sessions
sleep "$CHECK_INTERVAL"
done
}
# === MAIN ===
case "${1:-}" in
--daemon)
run_daemon
;;
--status)
show_status
;;
--session)
if [ -z "${2:-}" ]; then
echo "Usage: pane-watchdog.sh --session SESSION_NAME"
exit 1
fi
MONITOR_SESSIONS="$2"
check_all_sessions
;;
--help|-h)
echo "pane-watchdog.sh — Detect stuck/dead tmux panes and auto-restart"
echo ""
echo "Usage:"
echo " pane-watchdog.sh # One-shot check"
echo " pane-watchdog.sh --daemon # Continuous monitoring"
echo " pane-watchdog.sh --status # Show pane state"
echo " pane-watchdog.sh --session S # Check one session"
echo ""
echo "Config (env vars):"
echo " PANE_CHECK_INTERVAL Seconds between checks (default: 120)"
echo " PANE_WATCHDOG_SESSIONS Comma-separated session names"
echo " PANE_STATE_FILE State file path"
echo " STUCK_CYCLES Unchanged cycles before STUCK (default: 2)"
;;
*)
check_all_sessions
;;
esac