|
|
|
|
@@ -1,514 +0,0 @@
|
|
|
|
|
#!/usr/bin/env bash
|
|
|
|
|
# pane-watchdog.sh — Detect stuck/dead tmux panes and auto-restart them
|
|
|
|
|
#
|
|
|
|
|
# Tracks output hash per pane across cycles. If a pane's captured output
|
|
|
|
|
# hasn't changed for STUCK_CYCLES consecutive checks, the pane is STUCK.
|
|
|
|
|
# Dead panes (PID gone) are also detected.
|
|
|
|
|
#
|
|
|
|
|
# On STUCK/DEAD:
|
|
|
|
|
# 1. Kill the pane
|
|
|
|
|
# 2. Attempt restart with --resume (session ID from manifest)
|
|
|
|
|
# 3. Fallback: fresh prompt with last known task from logs
|
|
|
|
|
#
|
|
|
|
|
# State file: ~/.hermes/pane-state.json
|
|
|
|
|
# Log: ~/.hermes/logs/pane-watchdog.log
|
|
|
|
|
#
|
|
|
|
|
# Usage:
|
|
|
|
|
# pane-watchdog.sh # One-shot check all sessions
|
|
|
|
|
# pane-watchdog.sh --daemon # Run every CHECK_INTERVAL seconds
|
|
|
|
|
# pane-watchdog.sh --status # Print current pane state
|
|
|
|
|
# pane-watchdog.sh --session NAME # Check only one session
|
|
|
|
|
#
|
|
|
|
|
# Issue: timmy-config #515
|
|
|
|
|
|
|
|
|
|
# errexit (-e) is not enabled: several functions below (e.g. pane_pid_alive,
# is_restartable) return non-zero as an ordinary result.
set -uo pipefail
export PATH="/opt/homebrew/bin:$HOME/.local/bin:$HOME/.hermes/bin:/usr/local/bin:$PATH"

# === CONFIG ===
# Values written as ${VAR:-default} can be overridden from the environment.
STATE_FILE="${PANE_STATE_FILE:-$HOME/.hermes/pane-state.json}"
LOG_FILE="${PANE_WATCHDOG_LOG:-$HOME/.hermes/logs/pane-watchdog.log}"
CHECK_INTERVAL="${PANE_CHECK_INTERVAL:-120}"  # seconds between cycles
STUCK_CYCLES="${STUCK_CYCLES:-2}"             # unchanged cycles before STUCK (env override, as --help documents)
MAX_RESTART_ATTEMPTS=3                        # restart budget per pane within RESTART_COOLDOWN
RESTART_COOLDOWN=3600                         # seconds after the last restart before the attempt counter resets
CAPTURE_LINES=40                              # lines of output to hash

# Sessions to monitor (all if empty)
MONITOR_SESSIONS="${PANE_WATCHDOG_SESSIONS:-}"

# Make sure the directories holding the state file and the log exist.
mkdir -p "$(dirname "$STATE_FILE")" "$(dirname "$LOG_FILE")"
|
|
|
|
|
|
|
|
|
|
# Append one timestamped line to $LOG_FILE.
# Arguments: the message; all arguments are joined with single spaces.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*" >> "$LOG_FILE"
}
|
|
|
|
|
|
|
|
|
|
# === HELPERS ===
|
|
|
|
|
|
|
|
|
|
# Capture last N lines of pane output and hash them
|
|
|
|
|
capture_pane_hash() {
|
|
|
|
|
local target="$1"
|
|
|
|
|
local output
|
|
|
|
|
output=$(tmux capture-pane -t "$target" -p -S "-${CAPTURE_LINES}" 2>/dev/null || echo "DEAD")
|
|
|
|
|
echo -n "$output" | shasum -a 256 | cut -d' ' -f1
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Check whether the process tmux reports as pane_pid for a pane is alive.
# Arguments: $1 - tmux target pane
# Returns:   0 if the PID exists, 1 if tmux cannot resolve the target or the
#            PID is gone.
pane_pid_alive() {
  local target="$1"
  local pid
  # display-message resolves exactly the target pane. `list-panes -t` lists
  # every pane of the containing window, so taking its first line checked the
  # wrong pane for any target other than the window's first pane.
  pid=$(tmux display-message -p -t "$target" '#{pane_pid}' 2>/dev/null) || pid=""
  if [[ -z "$pid" ]]; then
    return 1  # pane doesn't exist
  fi
  kill -0 "$pid" 2>/dev/null
}
|
|
|
|
|
|
|
|
|
|
# Print the command a pane was started with (tmux #{pane_start_command}).
# Arguments: $1 - tmux target pane
# Outputs:   the start command, or "unknown" when tmux cannot resolve the target.
pane_start_command() {
  local target="$1"
  # Query the target pane itself; `list-panes -t ... | head -1` returned the
  # first pane of the window regardless of which pane was asked for.
  tmux display-message -p -t "$target" '#{pane_start_command}' 2>/dev/null || echo "unknown"
}
|
|
|
|
|
|
|
|
|
|
# Print the command currently running in a pane (tmux #{pane_current_command}).
# Arguments: $1 - tmux target pane
# Outputs:   one command name, or "unknown" when tmux cannot resolve the target.
pane_current_command() {
  local target="$1"
  # `list-panes -t` prints one line per pane of the window; feeding that
  # multi-line value to is_restartable made a shell pane look restartable
  # whenever a neighbouring pane ran hermes. Ask for the target pane only.
  tmux display-message -p -t "$target" '#{pane_current_command}' 2>/dev/null || echo "unknown"
}
|
|
|
|
|
|
|
|
|
|
# Decide whether a pane command belongs to an agent that may be auto-restarted.
# Arguments: $1 - command name/string reported for the pane
# Returns:   0 when the command contains one of the known agent markers,
#            1 otherwise (plain shells, REPLs, ...).
is_restartable() {
  local cmd="$1"
  local marker
  for marker in hermes agent timmy kimi claude-loop gemini-loop; do
    if [[ "$cmd" == *"$marker"* ]]; then
      return 0
    fi
  done
  return 1
}
|
|
|
|
|
|
|
|
|
|
# Print the hermes session ID recorded in a session's manifest, if any.
# Arguments: $1 - tmux session name (used as the directory name under
#                 ~/.hermes/sessions)
# Outputs:   the manifest's "session_id" (falling back to "id"), or an empty
#            line when the manifest is missing, unreadable, or has neither key.
get_hermes_session_id() {
  local session_name="$1"
  local manifest="$HOME/.hermes/sessions/${session_name}/manifest.json"

  if [[ ! -f "$manifest" ]]; then
    echo ""
    return 0
  fi

  # The path is handed over as argv instead of being pasted into the Python
  # source, so quotes or backslashes in it cannot break (or alter) the script.
  python3 - "$manifest" 2>/dev/null <<'PYEOF' || echo ""
import json
import sys

try:
    with open(sys.argv[1]) as f:
        manifest = json.load(f)
except (OSError, ValueError):
    # Unreadable or malformed manifest: report "no session id".
    sys.exit(0)

if isinstance(manifest, dict):
    # `or` chaining skips null/empty values, so a null session_id is never
    # printed as the literal string "None".
    print(manifest.get("session_id") or manifest.get("id") or "")
PYEOF
}
|
|
|
|
|
|
|
|
|
|
# Print the last task-like line found in the newest log of a session.
# Arguments: $1 - session name; log files under ~/.hermes/logs whose name
#                 contains it and that were modified within the last day are
#                 considered
# Outputs:   at most 200 bytes of the last line matching
#            task:/prompt:/issue/working on (case-insensitive), with everything
#            up to the last ':' or '>' removed; nothing when no log matches.
get_last_task() {
  local session_name="$1"
  local log_dir="$HOME/.hermes/logs"
  local log_file="" candidate

  # Pick the candidate with the newest mtime. Sorting the paths in reverse
  # name order (the previous approach) selects by file name, not by recency.
  while IFS= read -r -d '' candidate; do
    if [[ -z "$log_file" || "$candidate" -nt "$log_file" ]]; then
      log_file="$candidate"
    fi
  done < <(find "$log_dir" -name "*${session_name}*" -type f -mtime -1 -print0 2>/dev/null)

  if [[ -n "$log_file" && -f "$log_file" ]]; then
    # Extract last user prompt or task description
    grep -i "task:\|prompt:\|issue\|working on" "$log_file" 2>/dev/null \
      | tail -1 \
      | sed 's/.*[:>] *//' \
      | head -c 200
  fi
}
|
|
|
|
|
|
|
|
|
|
# Wrap a string in single quotes for use inside a shell command line,
# escaping embedded single quotes as '\''.
_watchdog_shell_quote() {
  local q="'" s="$1"
  s=${s//$q/$q\\$q$q}
  printf "'%s'" "$s"
}

# Run a shell command in a new detached pane of a session and print the new
# pane's id. If the split fails (e.g. the session vanished because its last
# pane was just killed), try to recreate the session with that command.
_watchdog_spawn_pane() {
  local session_name="$1" shell_cmd="$2"
  tmux split-window -t "$session_name" -d -P -F '#{pane_id}' "$shell_cmd" 2>/dev/null \
    || tmux new-session -d -s "$session_name" -P -F '#{pane_id}' "$shell_cmd" 2>/dev/null
}

# Restart a pane: kill it, then relaunch hermes in the same session.
#   1. `hermes chat --resume <id>` when the session manifest yields an id
#   2. otherwise (or if the resumed pane is not alive after 2s) a fresh
#      `hermes chat`, preceded by an echo of the last known task if one is found
# Output of the relaunched command is appended to
# ~/.hermes/logs/<session>-restart.log.
# Arguments: $1 - tmux target pane ("session:window.pane")
# Returns:   0 when the newly created pane is alive, 1 otherwise.
restart_pane() {
  local target="$1"
  local session_name="${target%%:*}"
  local session_id last_task new_pane tee_cmd

  log "RESTART: Attempting to restart $target"

  # Kill existing pane
  tmux kill-pane -t "$target" 2>/dev/null || true
  sleep 1

  # Values that originate from files (manifest, logs) are quoted before being
  # embedded in the command string handed to tmux.
  tee_cmd="tee -a $(_watchdog_shell_quote "$HOME/.hermes/logs/${session_name}-restart.log")"

  # Try --resume with session ID
  session_id=$(get_hermes_session_id "$session_name")
  if [[ -n "$session_id" ]]; then
    log "RESTART: Trying --resume with session $session_id"
    new_pane=$(_watchdog_spawn_pane "$session_name" \
      "hermes chat --resume $(_watchdog_shell_quote "$session_id") 2>&1 | $tee_cmd") || new_pane=""
    sleep 2
    # Verify the pane that was actually created, not a fixed "<session>:1".
    if [[ -n "$new_pane" ]] && pane_pid_alive "$new_pane" 2>/dev/null; then
      log "RESTART: Success with --resume"
      return 0
    fi
  fi

  # Fallback: fresh prompt
  last_task=$(get_last_task "$session_name")
  if [[ -n "$last_task" ]]; then
    log "RESTART: Fallback — fresh prompt with task: $last_task"
    new_pane=$(_watchdog_spawn_pane "$session_name" \
      "echo $(_watchdog_shell_quote "Watchdog restart — last task: $last_task") && hermes chat 2>&1 | $tee_cmd") || new_pane=""
  else
    log "RESTART: Fallback — fresh hermes chat"
    new_pane=$(_watchdog_spawn_pane "$session_name" \
      "hermes chat 2>&1 | $tee_cmd") || new_pane=""
  fi

  sleep 2
  if [[ -n "$new_pane" ]] && pane_pid_alive "$new_pane" 2>/dev/null; then
    log "RESTART: Fallback restart succeeded"
    return 0
  fi

  log "RESTART: FAILED to restart $target"
  return 1
}
|
|
|
|
|
|
|
|
|
|
# === STATE MANAGEMENT ===
|
|
|
|
|
|
|
|
|
|
# Print the contents of $STATE_FILE, or an empty JSON object when the file
# does not exist.
read_state() {
  if [[ ! -f "$STATE_FILE" ]]; then
    echo "{}"
    return
  fi
  cat "$STATE_FILE"
}
|
|
|
|
|
|
|
|
|
|
# Overwrite $STATE_FILE with the given text followed by a newline.
# Arguments: $1 - serialized state
write_state() {
  # printf writes the payload verbatim; `echo "$1"` would interpret values
  # such as "-n" or "-e" as options instead of data.
  printf '%s\n' "$1" > "$STATE_FILE"
}
|
|
|
|
|
|
|
|
|
|
# Update the stored state of one pane and print the pane's record as JSON.
# Arguments: $1 - pane target (key in the state file)
#            $2 - hash of the pane's current output
#            $3 - "true" if the pane process is alive, anything else = dead
# Globals:   STATE_FILE (read/written), STUCK_CYCLES (read)
# Status rules:
#   not alive                          -> DEAD  (same_count reset)
#   hash changed since last check      -> OK    (same_count reset, last_change = now)
#   hash unchanged, count < threshold  -> STALE
#   hash unchanged, count >= threshold -> STUCK
update_pane_state() {
  local target="$1"
  local hash="$2"
  local is_alive="$3"
  local now
  now=$(date +%s)

  python3 - "$STATE_FILE" "$target" "$hash" "$is_alive" "$now" "$STUCK_CYCLES" <<'PYEOF'
import json
import sys

state_file, target, new_hash = sys.argv[1:4]
is_alive = sys.argv[4] == "true"
now = int(sys.argv[5])
stuck_cycles = int(sys.argv[6])

try:
    with open(state_file) as f:
        state = json.load(f)
except (OSError, ValueError):
    state = {}
# A state file holding valid JSON that is not an object (e.g. "[]") is
# treated like an empty state instead of crashing on .get().
if not isinstance(state, dict):
    state = {}

pane = state.get(target)
if not isinstance(pane, dict):
    pane = {
        "hash": "",
        "same_count": 0,
        "status": "UNKNOWN",
        "last_change": 0,
        "last_check": 0,
        "restart_attempts": 0,
        "last_restart": 0,
        "current_command": "",
    }

if not is_alive:
    pane["status"] = "DEAD"
    pane["same_count"] = 0
elif new_hash == pane.get("hash", ""):
    pane["same_count"] = pane.get("same_count", 0) + 1
    # The counter has just been incremented, so an unchanged pane is STALE
    # until it reaches the threshold and becomes STUCK.
    pane["status"] = "STUCK" if pane["same_count"] >= stuck_cycles else "STALE"
else:
    pane["hash"] = new_hash
    pane["same_count"] = 0
    pane["status"] = "OK"
    pane["last_change"] = now

pane["last_check"] = now
state[target] = pane

with open(state_file, "w") as f:
    json.dump(state, f, indent=2)

print(json.dumps(pane))
PYEOF
}
|
|
|
|
|
|
|
|
|
|
# Reset a pane's restart counter once RESTART_COOLDOWN seconds have passed
# since its last restart, then print the (possibly reset) counter.
# Arguments: $1 - pane target (key in the state file)
# Globals:   STATE_FILE (read/written), RESTART_COOLDOWN (read)
# Outputs:   the pane's restart_attempts as an integer on stdout. A missing or
#            unparsable state file counts as "no attempts" (prints 0) instead
#            of failing with a traceback and printing nothing.
maybe_reset_restarts() {
  local target="$1"
  local now
  now=$(date +%s)

  python3 - "$STATE_FILE" "$target" "$now" "$RESTART_COOLDOWN" <<'PYEOF'
import json
import sys

state_file, target = sys.argv[1:3]
now = int(sys.argv[3])
cooldown = int(sys.argv[4])

try:
    with open(state_file) as f:
        state = json.load(f)
except (OSError, ValueError):
    state = {}
if not isinstance(state, dict):
    state = {}

pane = state.get(target)
if not isinstance(pane, dict):
    pane = {}

if now - pane.get("last_restart", 0) > cooldown:
    pane["restart_attempts"] = 0

state[target] = pane
with open(state_file, "w") as f:
    json.dump(state, f, indent=2)

print(pane.get("restart_attempts", 0))
PYEOF
}
|
|
|
|
|
|
|
|
|
|
# Record a restart attempt for a pane: bump restart_attempts, stamp
# last_restart with the current time and set status to RESTARTING.
# Arguments: $1 - pane target (key in the state file)
# Globals:   STATE_FILE (read/written)
# Outputs:   the new attempt count on stdout. A missing or unparsable state
#            file is treated as empty, so the first attempt prints 1.
increment_restart_attempt() {
  local target="$1"
  local now
  now=$(date +%s)

  python3 - "$STATE_FILE" "$target" "$now" <<'PYEOF'
import json
import sys

state_file, target = sys.argv[1:3]
now = int(sys.argv[3])

try:
    with open(state_file) as f:
        state = json.load(f)
except (OSError, ValueError):
    state = {}
if not isinstance(state, dict):
    state = {}

pane = state.get(target)
if not isinstance(pane, dict):
    pane = {}

pane["restart_attempts"] = pane.get("restart_attempts", 0) + 1
pane["last_restart"] = now
pane["status"] = "RESTARTING"

state[target] = pane
with open(state_file, "w") as f:
    json.dump(state, f, indent=2)

print(pane["restart_attempts"])
PYEOF
}
|
|
|
|
|
|
|
|
|
|
# === CORE CHECK ===
|
|
|
|
|
|
|
|
|
|
# Inspect one pane, persist its state and react to the resulting status:
#   DEAD / STUCK -> log the detection; restart via handle_stuck when the
#                   pane's command is restartable, otherwise log a SKIP
#   STALE        -> log how many cycles the output has been unchanged
#   anything else (including OK) -> nothing
# Arguments: $1 - tmux target pane
check_pane() {
  local target="$1"
  local hash is_alive status current_cmd result reason count

  # Gather the facts about the pane.
  hash=$(capture_pane_hash "$target")
  is_alive="false"
  if pane_pid_alive "$target"; then
    is_alive="true"
  fi
  current_cmd=$(pane_current_command "$target")

  # Persist them and read back the computed status.
  result=$(update_pane_state "$target" "$hash" "$is_alive")
  status=$(python3 -c "import json,sys; print(json.loads(sys.stdin.read()).get('status','UNKNOWN'))" <<< "$result" 2>/dev/null || echo "UNKNOWN")

  case "$status" in
    DEAD|STUCK)
      if [[ "$status" == "DEAD" ]]; then
        reason="PID gone"
      else
        reason="output unchanged for ${STUCK_CYCLES} cycles"
      fi
      log "DETECTED: $target is $status ($reason) cmd=$current_cmd"
      if is_restartable "$current_cmd"; then
        handle_stuck "$target"
      else
        log "SKIP: $target not a hermes pane (cmd=$current_cmd), not restarting"
      fi
      ;;
    STALE)
      # Unchanged, but still below the STUCK threshold: only record it.
      count=$(python3 -c "import json,sys; print(json.loads(sys.stdin.read()).get('same_count',0))" <<< "$result" 2>/dev/null || echo "?")
      log "STALE: $target unchanged for $count cycle(s)"
      ;;
    *)
      # OK (healthy) or an unrecognised status: nothing to do.
      ;;
  esac
}
|
|
|
|
|
|
|
|
|
|
# Restart a stuck/dead pane unless its restart budget is exhausted.
# Arguments: $1 - tmux target pane
# Globals:   MAX_RESTART_ATTEMPTS (read)
# Outputs:   an ALERT line on stderr when the budget is exhausted
# Returns:   1 when the budget is exhausted (manual intervention needed);
#            otherwise the status of the final log call, whether or not the
#            restart itself succeeded.
handle_stuck() {
  local target="$1"
  local attempts

  # Check restart budget
  attempts=$(maybe_reset_restarts "$target")
  # If the state could not be read the helper may print nothing; treat any
  # non-numeric value as zero so the comparison below does not error out.
  [[ "$attempts" =~ ^[0-9]+$ ]] || attempts=0

  if [ "$attempts" -ge "$MAX_RESTART_ATTEMPTS" ]; then
    log "ESCALATION: $target stuck ${attempts}x — manual intervention needed"
    echo "ALERT: $target stuck after $attempts restart attempts" >&2
    return 1
  fi

  attempts=$(increment_restart_attempt "$target")
  log "ACTION: Restarting $target (attempt $attempts/$MAX_RESTART_ATTEMPTS)"

  if restart_pane "$target"; then
    log "OK: $target restarted successfully"
  else
    log "FAIL: $target restart failed (attempt $attempts)"
  fi
}
|
|
|
|
|
|
|
|
|
|
# Run check_pane on every pane of every monitored session and log the total.
# Globals:   MONITOR_SESSIONS (read) - comma-separated session names; when
#            empty, all sessions reported by `tmux list-sessions` are checked.
check_all_sessions() {
  local -a sessions=() panes=()
  local session target line
  local total=0

  if [[ -n "$MONITOR_SESSIONS" ]]; then
    IFS=',' read -ra sessions <<< "$MONITOR_SESSIONS"
  else
    while IFS= read -r line; do
      [[ -n "$line" ]] && sessions+=("$line")
    done < <(tmux list-sessions -F '#{session_name}' 2>/dev/null || true)
  fi

  # ${arr[@]+...} keeps an empty array from tripping `set -u` on older bash.
  for session in ${sessions[@]+"${sessions[@]}"}; do
    [[ -z "$session" ]] && continue

    # Collect the targets first so the checks below cannot consume the
    # listing's stdin. `-s` lists the panes of every window in the session;
    # without it only the session's current window is covered.
    panes=()
    while IFS= read -r line; do
      [[ -n "$line" ]] && panes+=("$line")
    done < <(tmux list-panes -s -t "$session" -F "${session}:#{window_index}.#{pane_index}" 2>/dev/null || true)

    for target in ${panes[@]+"${panes[@]}"}; do
      check_pane "$target"
      total=$((total + 1))
    done
  done

  log "CHECK: Processed $total panes"
}
|
|
|
|
|
|
|
|
|
|
# === STATUS DISPLAY ===
|
|
|
|
|
|
|
|
|
|
# Print a table of all tracked panes from $STATE_FILE.
# Columns: pane target, status, consecutive unchanged checks (STALE),
#          time since the output last changed, restart attempts.
# Globals: STATE_FILE (read)
# Returns: 0 when the state file is missing (a hint is printed instead).
show_status() {
  if [[ ! -f "$STATE_FILE" ]]; then
    echo "No pane state file found at $STATE_FILE"
    echo "Run pane-watchdog.sh once to initialize."
    # return, not exit: a display helper should not terminate its caller.
    return 0
  fi

  python3 - "$STATE_FILE" <<'PYEOF'
import json
import sys
import time

ICONS = {"OK": "✓", "STUCK": "✖", "DEAD": "☠", "STALE": "⏳"}


def format_age(seconds):
    """Render an age in seconds as a short "Ns/Nm/Nh ago" string."""
    if seconds < 60:
        return f"{seconds}s ago"
    if seconds < 3600:
        return f"{seconds // 60}m ago"
    return f"{seconds // 3600}h ago"


try:
    with open(sys.argv[1]) as f:
        state = json.load(f)
except (OSError, ValueError):
    print("No state data yet.")
    sys.exit(0)

# Valid JSON that is not an object carries no pane records.
if not isinstance(state, dict) or not state:
    print("No panes tracked.")
    sys.exit(0)

now = int(time.time())
# Each row starts with a 4-character icon prefix plus a 32-wide target, so
# the first header column is 36 wide to keep the columns aligned.
print(f"{'PANE':<36} {'STATUS':<12} {'STALE':<6} {'LAST CHANGE':<15} {'RESTARTS'}")
print("-" * 90)

for target in sorted(state.keys()):
    p = state[target]
    if not isinstance(p, dict):
        continue
    status = p.get("status", "?")
    same = p.get("same_count", 0)
    last_change = p.get("last_change", 0)
    restarts = p.get("restart_attempts", 0)

    change_str = format_age(now - last_change) if last_change else "never"
    icon = ICONS.get(status, "?")

    print(f"  {icon} {target:<32} {status:<12} {same:<6} {change_str:<15} {restarts}")
PYEOF
}
|
|
|
|
|
|
|
|
|
|
# === DAEMON MODE ===
|
|
|
|
|
|
|
|
|
|
# Daemon mode: print a short banner, then run check_all_sessions forever,
# sleeping CHECK_INTERVAL seconds after each cycle. Never returns on its own.
# Globals: CHECK_INTERVAL, STUCK_CYCLES, LOG_FILE, STATE_FILE (read)
run_daemon() {
  log "DAEMON: Starting (interval=${CHECK_INTERVAL}s, stuck_threshold=${STUCK_CYCLES})"

  # Banner: three informational lines followed by a blank line.
  printf '%s\n' \
    "Pane watchdog started. Checking every ${CHECK_INTERVAL}s. Ctrl+C to stop." \
    "Log: $LOG_FILE" \
    "State: $STATE_FILE" \
    ""

  while :; do
    check_all_sessions
    sleep "$CHECK_INTERVAL"
  done
}
|
|
|
|
|
|
|
|
|
|
# === MAIN ===
|
|
|
|
|
|
|
|
|
|
# Command-line dispatch. With no argument a single check cycle is run.
case "${1:-}" in
  --daemon)
    run_daemon
    ;;
  --status)
    show_status
    ;;
  --session)
    if [[ -z "${2:-}" ]]; then
      # Usage errors are diagnostics, so they go to stderr.
      echo "Usage: pane-watchdog.sh --session SESSION_NAME" >&2
      exit 1
    fi
    MONITOR_SESSIONS="$2"
    check_all_sessions
    ;;
  --help|-h)
    cat <<'EOF'
pane-watchdog.sh — Detect stuck/dead tmux panes and auto-restart

Usage:
  pane-watchdog.sh                # One-shot check
  pane-watchdog.sh --daemon       # Continuous monitoring
  pane-watchdog.sh --status       # Show pane state
  pane-watchdog.sh --session S    # Check one session

Config (env vars):
  PANE_CHECK_INTERVAL     Seconds between checks (default: 120)
  PANE_WATCHDOG_SESSIONS  Comma-separated session names
  PANE_STATE_FILE         State file path
  PANE_WATCHDOG_LOG       Log file path
  STUCK_CYCLES            Unchanged cycles before STUCK (default: 2)
EOF
    ;;
  "")
    check_all_sessions
    ;;
  *)
    # A mistyped flag must not silently trigger a check cycle, because a
    # cycle can kill and restart panes.
    echo "pane-watchdog.sh: unknown option: $1 (try --help)" >&2
    exit 2
    ;;
esac
|