#!/usr/bin/env bash
# ── Tower Watchdog ─────────────────────────────────────────────────────
# Ensures the tower session stays alive. Restarts dead panes/windows.
# Run via cron: */5 * * * * ~/hermes-config/bin/tower-watchdog.sh
#
# Layout:
#   Window 1, Pane 1: tower-hermes.sh  (conversation driver)
#   Window 1, Pane 2: tower-status.sh  (status dashboard)
#   Window 2:         tower-timmy.sh   (Timmy loop, hidden)
#
# Source-controlled: gitea/rockachopa/hermes-config
# ───────────────────────────────────────────────────────────────────────

# Name of the tmux session this watchdog supervises.
SESSION="tower"
# Directory holding the tower-*.sh scripts that get launched in the panes.
TOWER_BIN="$HOME/hermes-config/bin"
# Watchdog log file; written by log() and trimmed at the end of each run.
LOG="$HOME/.tower/watchdog.log"

# The state directory holds the log plus the .paused / .lock marker files
# consulted further down; create it before the first log() call.
mkdir -p "$HOME/.tower"

# Append one line to $LOG: "[YYYY-MM-DD HH:MM:SS] " followed by all
# arguments joined into a single message.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*" >> "$LOG"
}

# If the session doesn't exist at all, rebuild the whole layout and stop:
#   window 1 = conversation pane (left) + status pane (right, 35% wide)
#   window 2 = background Timmy loop
# NOTE(review): the :1.1 / :1.2 / :2 targets assume tmux runs with
# base-index 1 and pane-base-index 1 — confirm against the tmux config.
if ! tmux has-session -t "$SESSION" 2>/dev/null; then
  log "Session '$SESSION' missing. Recreating."

  # Without a session every later tmux call would fail too; report that
  # instead of logging a recreation that never happened.
  if ! tmux new-session -d -s "$SESSION" -n "tower" -x 200 -y 50; then
    log "Failed to create session '$SESSION'."
    exit 1
  fi

  tmux split-window -h -p 35 -t "$SESSION:1.1"
  tmux select-pane -t "$SESSION:1.1" -T "⚡ Tower"
  tmux select-pane -t "$SESSION:1.2" -T "📊 Status"
  tmux send-keys -t "$SESSION:1.1" "$TOWER_BIN/tower-hermes.sh" Enter
  tmux send-keys -t "$SESSION:1.2" "$TOWER_BIN/tower-status.sh" Enter

  # Hidden window for Timmy
  tmux new-window -t "$SESSION" -n "timmy-bg"
  tmux send-keys -t "$SESSION:2" "$TOWER_BIN/tower-timmy.sh" Enter

  # Leave the conversation window selected.
  tmux select-window -t "$SESSION:1"

  log "Session recreated (conversation + status + timmy-bg)."
  exit 0
fi

# Session exists — check window 1 panes.
# Window 1 must hold two panes (conversation + status). If list-panes fails
# (for example because window 1 is gone) the count is 0, which takes the same
# rebuild path.
PANE_COUNT=$(tmux list-panes -t "$SESSION:1" 2>/dev/null | wc -l | tr -d ' ')

if [ "$PANE_COUNT" -lt 2 ]; then
  log "Window 1 has only $PANE_COUNT pane(s). Killing and recreating session."
  tmux kill-session -t "$SESSION" 2>/dev/null
  # Re-run this script: with the session gone, the "session missing" branch
  # rebuilds the full layout and exits.
  exec "$0"
fi

# Check Hermes loop (window 1, pane 1)
#
# Graceful pause: per the change that introduced the pause files,
# tower-hermes.sh writes ~/.tower/hermes.paused after repeated backend
# failures and clears it when the backend returns. While that file exists
# the watchdog must not restart the loop ("don't fight the pause").
if [ -f "$HOME/.tower/hermes.paused" ]; then
  # Skip if Hermes is gracefully paused (waiting for backend)
  log "Hermes is paused ($(<"$HOME/.tower/hermes.paused")). Skipping restart."
else
  # The script is typed into the pane's shell via send-keys, so #{pane_pid}
  # is that shell; a shell with no child processes means the script is not
  # running.
  HERMES_PID=$(tmux display-message -p -t "$SESSION:1.1" '#{pane_pid}' 2>/dev/null)
  if [ -n "$HERMES_PID" ] && ! pgrep -P "$HERMES_PID" >/dev/null 2>&1; then
    log "Hermes pane idle. Restarting tower-hermes.sh"
    # Drop the lock file before relaunching (presumably a stale lock would
    # block the new instance — confirm in tower-hermes.sh).
    rm -f "$HOME/.tower/hermes.lock"
    tmux send-keys -t "$SESSION:1.1" "$TOWER_BIN/tower-hermes.sh" Enter
  fi
fi

# Check status pane (window 1, pane 2) — restart if dead
# #{pane_pid} is the pane's shell (the dashboard is started from it with
# send-keys); if that shell has no child processes the dashboard is not
# running.
STATUS_PID=$(tmux display-message -p -t "$SESSION:1.2" '#{pane_pid}' 2>/dev/null)
if [ -n "$STATUS_PID" ] && ! pgrep -P "$STATUS_PID" >/dev/null 2>&1; then
  log "Status pane idle. Restarting tower-status.sh"
  tmux send-keys -t "$SESSION:1.2" "$TOWER_BIN/tower-status.sh" Enter
fi

# Check Timmy loop (window 2)
#
# tmux has no `has-window` command, so testing with it always fails and a
# fresh "timmy-bg" window would be spawned on every run. Look for window
# index 2 in the session's window list instead.
if ! tmux list-windows -t "$SESSION" -F '#{window_index}' 2>/dev/null | grep -qx '2'; then
  log "Timmy window missing. Recreating."
  # Create the window explicitly at index 2 so the send-keys target below is
  # guaranteed to be the window just created.
  tmux new-window -t "$SESSION:2" -n "timmy-bg"
  tmux send-keys -t "$SESSION:2" "$TOWER_BIN/tower-timmy.sh" Enter
  tmux select-window -t "$SESSION:1"
elif [ -f "$HOME/.tower/timmy.paused" ]; then
  # Skip if Timmy is gracefully paused (waiting for backend).
  # Per the change that introduced the pause files, tower-timmy.sh writes
  # ~/.tower/timmy.paused after repeated backend failures and clears it when
  # the backend returns; the watchdog must not fight that pause.
  log "Timmy is paused ($(<"$HOME/.tower/timmy.paused")). Skipping restart."
else
  # The script is typed into the window's shell via send-keys, so
  # #{pane_pid} is that shell; no child processes means the loop is not
  # running.
  TIMMY_PID=$(tmux display-message -p -t "$SESSION:2" '#{pane_pid}' 2>/dev/null)
  if [ -n "$TIMMY_PID" ] && ! pgrep -P "$TIMMY_PID" >/dev/null 2>&1; then
    log "Timmy pane idle. Restarting tower-timmy.sh"
    # Drop the lock file before relaunching (presumably a stale lock would
    # block the new instance — confirm in tower-timmy.sh).
    rm -f "$HOME/.tower/timmy.lock"
    tmux send-keys -t "$SESSION:2" "$TOWER_BIN/tower-timmy.sh" Enter
  fi
fi

# Trim log if > 1000 lines
# Keep the newest 500 lines. The copy goes to a sibling temp file and is
# renamed over the log only if tail succeeded.
if [ -f "$LOG" ] && [ "$(wc -l < "$LOG")" -gt 1000 ]; then
  tail -n 500 "$LOG" > "$LOG.tmp" && mv "$LOG.tmp" "$LOG"
  log "Log trimmed to 500 lines."
fi