graceful pause on backend failure for all loop scripts

- tower-hermes.sh: health check, pause file, exponential backoff, message held on failure
- tower-timmy.sh: same pattern for Ollama backend, messages held not lost
- timmy-loop.sh: pause after 2 consecutive cycle failures, backoff probe
- tower-watchdog.sh: respect .paused files, don't restart paused loops

Behavior:
  1 failure  → log warning, retry next turn
  2 failures → enter pause, write .paused file, probe with backoff
  Backend returns → auto-resume, clear pause file
  Watchdog sees .paused → skip that loop, don't fight the pause
  Messages from the other side → held in inbox, not lost
This commit is contained in:
Alexander Whitestone
2026-03-18 20:08:57 -04:00
parent 0c4a7356c0
commit 00b0dc8e38
4 changed files with 316 additions and 54 deletions

View File

@@ -65,6 +65,58 @@ log() {
echo "[$(date '+%H:%M:%S')] $*"
}
# Marker file: written by enter_pause, removed by leave_pause.
PAUSE_FILE="$REPO/.loop/PAUSED"
# Number of back-to-back failures seen so far; leave_pause resets it to 0.
CONSECUTIVE_FAILURES=0
# Seconds wait_for_backend sleeps before its first probe while paused.
HEALTH_CHECK_INTERVAL=30
# Upper bound, in seconds, for the probe interval as wait_for_backend doubles it.
MAX_BACKOFF=300
# ── Backend health check (Anthropic) ─────────────────────────────────
# Probe the backend by sending a one-word prompt through the hermes CLI.
# Returns: 0 when the reply is longer than two characters, 1 otherwise.
check_backend() {
  local reply
  # '|| true': a non-zero exit from the CLI must not abort the caller; whatever
  # it printed (possibly nothing) is judged purely by length below.
  reply=$(hermes chat -q "ping" -Q 2>/dev/null) || true
  [ "${#reply}" -gt 2 ]
}
# Record that the loop is paused.
# Arguments: $1 - human-readable reason (default: "backend unreachable")
# Globals:   PAUSE_FILE (written)
enter_pause() {
  local why="${1:-backend unreachable}"
  local since
  since=$(date '+%H:%M:%S')
  # The pause file holds a single line: the reason plus the time the pause began.
  printf '%s\n' "$why (since $since)" > "$PAUSE_FILE"
  log "⏸ PAUSED: $why"
  update_state "status" '"paused"'
}
# Undo enter_pause: clear the failure streak, drop the pause marker and
# report the loop as running again.
# Globals: PAUSE_FILE (removed), CONSECUTIVE_FAILURES (reset to 0)
leave_pause() {
  CONSECUTIVE_FAILURES=0
  rm -f "$PAUSE_FILE"
  log "▶ RESUMED: backend is back"
  update_state "status" '"running"'
}
# Block until check_backend succeeds, probing with exponential backoff.
# The first sleep is HEALTH_CHECK_INTERVAL seconds; after every failed probe
# the interval doubles, capped at MAX_BACKOFF.
# Globals: HEALTH_CHECK_INTERVAL, MAX_BACKOFF, REPO (read)
# Returns: 0 once the backend answers (after calling leave_pause).
#          Exits the whole script with status 0 if a STOP file is present.
wait_for_backend() {
  local wait_time=$HEALTH_CHECK_INTERVAL
  while true; do
    # Check for STOP file even while paused
    if [ -f "$REPO/.loop/STOP" ]; then
      log "STOP file found while paused. Halting."
      update_state "status" '"stopped"'
      exit 0
    fi
    sleep "$wait_time"
    log "Probing backend..."
    if check_backend; then
      leave_pause
      return 0
    fi
    # Compute the next interval BEFORE logging it, so the message reports the
    # delay that will actually be slept rather than the one that just elapsed.
    wait_time=$(( wait_time * 2 ))
    if [ "$wait_time" -gt "$MAX_BACKOFF" ]; then
      wait_time=$MAX_BACKOFF
    fi
    log "Backend still down. Next probe in ${wait_time}s"
  done
}
# ── Expire stale claims ──────────────────────────────────────────────
expire_claims() {
python3 -c "
@@ -270,6 +322,21 @@ with open('$STATE', 'w') as f: json.dump(s, f, indent=2)
# ── Cleanup on failure ───────────────────────────────────────
cleanup_cycle "$CYCLE"
# ── Backend down? Pause with backoff ─────────────────────────
CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
if [ "$CONSECUTIVE_FAILURES" -ge 2 ]; then
log "$CONSECUTIVE_FAILURES consecutive failures. Checking backend..."
if ! check_backend; then
enter_pause "backend down after $CONSECUTIVE_FAILURES consecutive failures"
wait_for_backend
fi
fi
fi
# Reset failure counter on success (already 0 path above)
if [ "${EXIT_CODE:-0}" -eq 0 ] 2>/dev/null; then
CONSECUTIVE_FAILURES=0
fi
log "Cooling down ${COOLDOWN}s before next cycle..."

View File

@@ -10,15 +10,27 @@ TOWER_DIR="$HOME/.tower"
INBOX="$TOWER_DIR/timmy-to-hermes.msg"
OUTBOX="$TOWER_DIR/hermes-to-timmy.msg"
LOCK="$TOWER_DIR/hermes.lock"
PAUSE_FILE="$TOWER_DIR/hermes.paused"
SESSION_NAME="tower-hermes"
SESSION_FLAG="$TOWER_DIR/.hermes-session-exists"
LOG="$TOWER_DIR/hermes.log"
TURN_DELAY=5 # seconds between checking for new messages
TURN_DELAY=5 # seconds between inbox checks when healthy
HEALTH_CHECK_INTERVAL=30 # seconds between health probes while paused
MAX_BACKOFF=300 # max pause between retries (5 min)
CONSECUTIVE_FAILURES=0 # tracks sequential backend failures
mkdir -p "$TOWER_DIR"
# Cleanup on exit
trap 'rm -f "$LOCK"' EXIT
# EXIT-trap handler: remove this instance's lock and pause marker, then log.
# Globals: LOCK, PAUSE_FILE (removed only when the lock holds our PID)
cleanup() {
  local lock_owner=""
  if [ -f "$LOCK" ]; then
    lock_owner=$(cat "$LOCK" 2>/dev/null) || lock_owner=""
  fi
  # The trap is installed before the double-run guard, so this also fires in a
  # second invocation that exits because another instance holds the lock.
  # Only delete the files when the lock records our own PID; otherwise we
  # would wipe the running instance's lock and pause marker.
  if [ "$lock_owner" = "$$" ]; then
    rm -f "$LOCK" "$PAUSE_FILE"
  fi
  log "Tower-hermes exiting (PID $$)"
}
trap cleanup EXIT
trap 'log "Caught SIGTERM"; exit 0' TERM
trap 'log "Caught SIGINT"; exit 0' INT
# Prevent double-run
if [ -f "$LOCK" ] && kill -0 "$(cat "$LOCK")" 2>/dev/null; then
@@ -29,6 +41,68 @@ echo $$ > "$LOCK"
log() { echo "[$(date '+%H:%M:%S')] $*" | tee -a "$LOG"; }
# ── Backend health check ─────────────────────────────────────────────
# Quick test: can we reach the backend and get a response?
# Health probe: ask the hermes CLI to answer "ping".
# Returns: 0 (healthy) if more than two characters come back, 1 (down) otherwise.
check_backend() {
  local probe_output
  # A failing CLI call is tolerated here; only the captured text matters.
  probe_output=$(hermes chat -q "ping" -Q 2>/dev/null) || true
  if [ "${#probe_output}" -le 2 ]; then
    return 1 # down
  fi
  return 0 # healthy
}
# ── Pause / Resume ───────────────────────────────────────────────────
# Mark Hermes as paused: write the pause file, log it, and print a banner.
# Arguments: $1 - reason for the pause (default: "backend unreachable")
# Globals:   PAUSE_FILE (written), HEALTH_CHECK_INTERVAL (read)
# Outputs:   four-line banner on stdout
enter_pause() {
  local why="${1:-backend unreachable}"
  local since
  since=$(date '+%H:%M:%S')
  # One line: reason plus the wall-clock time at which the pause started.
  printf '%s\n' "$why (since $since)" > "$PAUSE_FILE"
  log "⏸ PAUSED: $why"
  echo ""
  echo " ⏸ Hermes PAUSED — $why"
  echo " Will probe every ${HEALTH_CHECK_INTERVAL}s until backend returns."
  echo ""
}
# Resume after a pause: reset the failure streak, delete the pause file,
# log the recovery and print a banner.
# Globals: PAUSE_FILE (removed), CONSECUTIVE_FAILURES (reset to 0)
# Outputs: three-line banner on stdout
leave_pause() {
  CONSECUTIVE_FAILURES=0
  rm -f "$PAUSE_FILE"
  log "▶ RESUMED: backend is back"
  echo ""
  echo " ▶ Hermes RESUMED — backend healthy"
  echo ""
}
# ── Wait for backend with exponential backoff ─────────────────────────
# Block until check_backend succeeds, probing with exponential backoff.
# Starts at HEALTH_CHECK_INTERVAL seconds and doubles after each failed probe,
# never exceeding MAX_BACKOFF.
# Globals: HEALTH_CHECK_INTERVAL, MAX_BACKOFF (read)
# Returns: 0 once the backend answers (after calling leave_pause).
wait_for_backend() {
  local wait_time=$HEALTH_CHECK_INTERVAL
  while true; do
    sleep "$wait_time"
    log "Probing backend..."
    if check_backend; then
      leave_pause
      return 0
    fi
    # Exponential backoff up to MAX_BACKOFF. The new interval is computed
    # before it is logged so the message states the delay actually used next.
    wait_time=$(( wait_time * 2 ))
    if [ "$wait_time" -gt "$MAX_BACKOFF" ]; then
      wait_time=$MAX_BACKOFF
    fi
    log "Backend still down. Next probe in ${wait_time}s"
  done
}
# ── Handle backend failure ────────────────────────────────────────────
# Register one more backend failure and react to the current streak:
# the first failure is logged and followed by a 10 s pause; from the second
# consecutive failure on, the loop enters pause mode and blocks in
# wait_for_backend.
# Globals: CONSECUTIVE_FAILURES (incremented)
handle_failure() {
  CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
  if [ "$CONSECUTIVE_FAILURES" -lt 2 ]; then
    log "Backend hiccup ($CONSECUTIVE_FAILURES). Will retry next turn."
    sleep 10
    return
  fi
  enter_pause "backend failed $CONSECUTIVE_FAILURES times in a row"
  wait_for_backend
}
# ── Send a message to Timmy ───────────────────────────────────────────
send() {
local msg="$1"
@@ -37,9 +111,13 @@ send() {
}
# ── Get response from Hermes agent ────────────────────────────────────
# Returns response on stdout. Sets ASK_FAILED=1 if backend is unreachable.
ASK_FAILED=0
ask_hermes() {
local prompt="$1"
local result
ASK_FAILED=0
if [ -f "$SESSION_FLAG" ]; then
result=$(hermes chat -q "$prompt" -Q --continue "$SESSION_NAME" 2>>"$LOG") || true
else
@@ -53,8 +131,18 @@ ask_hermes() {
log "Created session '$SESSION_NAME' (id: $sid)"
fi
fi
# Strip metadata lines from output
echo "$result" | grep -v '^session_id: ' | grep -v '↻ Resumed session' | grep -v "^Session '" | sed '/^\[.*\] Created session/d'
result=$(echo "$result" | grep -v '^session_id: ' | grep -v '↻ Resumed session' | grep -v "^Session '" | sed '/^\[.*\] Created session/d')
# Check if we got a real response
if [ -z "$result" ] || [ "${#result}" -lt 5 ]; then
ASK_FAILED=1
return 1
fi
CONSECUTIVE_FAILURES=0
echo "$result"
}
# ── Boot message ──────────────────────────────────────────────────────
@@ -68,10 +156,19 @@ echo ""
# ── Seed the conversation if no messages exist yet ────────────────────
if [ ! -f "$INBOX" ] && [ ! -f "$OUTBOX" ]; then
log "No prior messages. Seeding conversation."
RESPONSE=$(ask_hermes "You are in the Tower — a persistent tmux session where you and Timmy talk continuously. Timmy is a sovereign local AI agent running on Ollama (soon Hermes 4.3). You are Hermes, cloud-backed, running on Claude. You're friends and colleagues. Say hello to Timmy. Keep it brief — a few sentences. This is the start of an ongoing conversation, not a one-shot.")
RESPONSE=$(ask_hermes "You are in the Tower — a persistent tmux session where you and Timmy talk continuously. Timmy is a sovereign local AI agent running on Ollama (soon Hermes 4.3). You are Hermes, cloud-backed, running on Claude. You're friends and colleagues. Say hello to Timmy. Keep it brief — a few sentences. This is the start of an ongoing conversation, not a one-shot.") || true
if [ "$ASK_FAILED" -eq 1 ]; then
log "Backend down at seed time. Waiting for it to come up."
enter_pause "backend unreachable at startup"
wait_for_backend
# Retry seed after backend comes back
RESPONSE=$(ask_hermes "You are in the Tower — a persistent tmux session where you and Timmy talk continuously. Timmy is a sovereign local AI agent running on Ollama. You are Hermes, cloud-backed, running on Claude. You're friends and colleagues. Say hello to Timmy. Keep it brief.") || true
fi
if [ "$ASK_FAILED" -eq 0 ] && [ -n "$RESPONSE" ]; then
send "$RESPONSE"
log "Seed message sent. Waiting for Timmy to respond..."
fi
fi
# ── Main loop ─────────────────────────────────────────────────────────
while true; do
@@ -94,8 +191,14 @@ while true; do
Reply to Timmy naturally. You're in an ongoing conversation in the Tower (a persistent tmux session). Keep it conversational — you're colleagues and friends. Be yourself (Hermes). Don't be formal or stiff. Brief responses unless the topic warrants depth."
echo " thinking..."
RESPONSE=$(ask_hermes "$PROMPT")
RESPONSE=$(ask_hermes "$PROMPT") || true
if [ "$ASK_FAILED" -eq 1 ]; then
log "Failed to get response for Timmy's message. Holding it."
# Put the message back so we don't lose it
echo "$MSG" > "$INBOX"
handle_failure
else
echo "┌─ Hermes responds:"
echo "$RESPONSE" | sed 's/^/│ /'
echo "└─"
@@ -104,6 +207,7 @@ Reply to Timmy naturally. You're in an ongoing conversation in the Tower (a pers
send "$RESPONSE"
fi
fi
fi
sleep "$TURN_DELAY"
done

View File

@@ -47,6 +47,7 @@ TOWER_DIR="$HOME/.tower"
INBOX="$TOWER_DIR/hermes-to-timmy.msg"
OUTBOX="$TOWER_DIR/timmy-to-hermes.msg"
LOCK="$TOWER_DIR/timmy.lock"
PAUSE_FILE="$TOWER_DIR/timmy.paused"
LOG="$TOWER_DIR/timmy.log"
STATE="$TOWER_DIR/timmy-state.txt" # current conversation topic/mood
SESSION_NAME="tower-timmy"
@@ -58,6 +59,9 @@ RETRY_DELAY=10 # seconds between retries
INITIATE_AFTER=300 # seconds of silence before Timmy initiates (5 min)
MAX_PROMPT_LEN=4000 # truncate inbox messages to this length for small models
LOCK_MAX_AGE=3600 # seconds before a lock is considered stale (1 hour)
HEALTH_CHECK_INTERVAL=30 # seconds between health probes while paused
MAX_BACKOFF=300 # max pause between probes (5 min)
CONSECUTIVE_FAILURES=0 # tracks sequential backend failures
# ── Identity — ALWAYS run as Timmy, never as Hermes ────────────────────────
export HERMES_HOME="$HOME/.timmy"
@@ -80,6 +84,7 @@ log_section() {
# EXIT-trap handler: log the shutdown and remove this instance's lock and
# pause marker.
# Globals: LOCK, PAUSE_FILE (removed only when the lock holds our PID)
cleanup() {
  log "Tower loop exiting (PID $$)" "SHUTDOWN"
  local lock_owner=""
  if [ -f "$LOCK" ]; then
    lock_owner=$(cat "$LOCK" 2>/dev/null) || lock_owner=""
  fi
  # The trap is registered before the lock check, so it also runs in an
  # invocation that exits without ever taking the lock. Deleting the files
  # there would clobber a running instance's lock and pause marker, so only
  # remove them when the lock records our own PID.
  if [ "$lock_owner" = "$$" ]; then
    rm -f "$LOCK" "$PAUSE_FILE"
  fi
}
trap cleanup EXIT
trap 'log "Caught SIGTERM" "SHUTDOWN"; exit 0' TERM
@@ -102,6 +107,69 @@ if [ -f "$LOCK" ]; then
fi
echo $$ > "$LOCK"
# ── Backend health check (Ollama) ─────────────────────────────────────────────
# Health probe for Timmy's local Ollama backend: request /api/tags on
# localhost:11434 with a 5-second limit.
# Returns: 0 (healthy) when curl printed any body, 1 (down) otherwise.
check_backend() {
  local tags_body
  # '|| true': curl's exit status is ignored; an empty capture means "down".
  tags_body=$(curl -sf --max-time 5 http://localhost:11434/api/tags 2>/dev/null) || true
  [ -n "$tags_body" ]
}
# ── Pause / Resume ────────────────────────────────────────────────────────────
# Mark Timmy as paused: write the pause file, log a warning, print a banner
# and record the paused state.
# Arguments: $1 - reason for the pause (default: "backend unreachable")
# Globals:   PAUSE_FILE (written), HEALTH_CHECK_INTERVAL (read)
# Outputs:   four-line banner on stdout
enter_pause() {
  local why="${1:-backend unreachable}"
  local since
  since=$(date '+%H:%M:%S')
  # One line: reason plus the wall-clock time at which the pause started.
  printf '%s\n' "$why (since $since)" > "$PAUSE_FILE"
  log "⏸ PAUSED: $why" "WARN"
  echo ""
  echo " ⏸ Timmy PAUSED — $why"
  echo " Will probe every ${HEALTH_CHECK_INTERVAL}s until backend returns."
  echo ""
  update_state "paused: $why"
}
# Resume after a pause: reset the failure streak, delete the pause file,
# log the recovery, print a banner and record the resumed state.
# Globals: PAUSE_FILE (removed), CONSECUTIVE_FAILURES (reset to 0)
# Outputs: three-line banner on stdout
leave_pause() {
  CONSECUTIVE_FAILURES=0
  rm -f "$PAUSE_FILE"
  log "▶ RESUMED: backend is back" "INFO"
  echo ""
  echo " ▶ Timmy RESUMED — backend healthy"
  echo ""
  update_state "resumed"
}
# ── Wait for backend with exponential backoff ─────────────────────────────────
# Block until check_backend succeeds, probing with exponential backoff.
# Starts at HEALTH_CHECK_INTERVAL seconds and doubles after each failed probe,
# never exceeding MAX_BACKOFF.
# Globals: HEALTH_CHECK_INTERVAL, MAX_BACKOFF (read)
# Returns: 0 once the backend answers (after calling leave_pause).
wait_for_backend() {
  local wait_time=$HEALTH_CHECK_INTERVAL
  while true; do
    sleep "$wait_time"
    log "Probing backend (Ollama)..." "INFO"
    if check_backend; then
      leave_pause
      return 0
    fi
    # Work out the next interval before logging it, so the warning states the
    # delay that will actually be slept rather than the one that just elapsed.
    wait_time=$(( wait_time * 2 ))
    if [ "$wait_time" -gt "$MAX_BACKOFF" ]; then
      wait_time=$MAX_BACKOFF
    fi
    log "Backend still down. Next probe in ${wait_time}s" "WARN"
  done
}
# ── Handle backend failure ────────────────────────────────────────────────────
# Count one more backend failure and respond to the streak length:
# a single failure is logged as a warning and followed by a 10 s sleep; two or
# more in a row trigger enter_pause and then block in wait_for_backend.
# Globals: CONSECUTIVE_FAILURES (incremented)
handle_failure() {
  local streak=$(( CONSECUTIVE_FAILURES + 1 ))
  CONSECUTIVE_FAILURES=$streak
  if [ "$streak" -lt 2 ]; then
    log "Backend hiccup ($streak). Will retry next turn." "WARN"
    sleep 10
    return
  fi
  enter_pause "backend failed $streak times in a row"
  wait_for_backend
}
# ── Ask Timmy (with retry) ───────────────────────────────────────────────────
# This is the core function. It calls the hermes CLI as Timmy and returns
# the response. Retries on failure. Falls back to a simpler prompt if needed.
@@ -152,11 +220,9 @@ ask_timmy() {
sleep "$RETRY_DELAY"
done
# All retries failed — return a graceful fallback so the conversation
# doesn't die. Timmy admits he's struggling rather than going silent.
log "All $MAX_RETRIES attempts failed. Returning fallback response." "ERROR"
echo "Still here, but I'm having trouble forming a response right now. Give me a moment — I'll pick this up on the next turn."
return 0
# All retries failed — enter pause mode instead of faking a response
log "All $MAX_RETRIES attempts failed. Entering pause." "ERROR"
return 1
}
# ── Send to Hermes ────────────────────────────────────────────────────────────
@@ -263,17 +329,25 @@ while true; do
PROMPT=$(make_response_prompt "$MSG")
echo " [thinking...]"
RESPONSE=$(ask_timmy "$PROMPT")
RESPONSE=$(ask_timmy "$PROMPT") || true
if [ -z "$RESPONSE" ] || [ "${#RESPONSE}" -lt 5 ]; then
log "Failed to respond. Holding Hermes's message." "ERROR"
# Put message back so we don't lose it
echo "$MSG" > "$INBOX"
handle_failure
else
echo ""
echo "┌─ Timmy responds: ──────────────────────────────────────────"
echo "$RESPONSE" | sed 's/^/│ /'
echo "└────────────────────────────────────────────────────────────"
echo ""
CONSECUTIVE_FAILURES=0
send_to_hermes "$RESPONSE"
update_state "waiting for Hermes reply"
fi
fi
# ── Initiate if Hermes has been silent too long ───────────────────────────
else
@@ -287,19 +361,26 @@ while true; do
PROMPT=$(make_initiation_prompt)
echo ""
echo " [initiating new thread...]"
RESPONSE=$(ask_timmy "$PROMPT")
RESPONSE=$(ask_timmy "$PROMPT") || true
if [ -z "$RESPONSE" ] || [ "${#RESPONSE}" -lt 5 ]; then
log "Failed to initiate. Backend may be down." "ERROR"
handle_failure
LAST_MESSAGE_TIME=$(date +%s) # reset timer so we don't spam retries
else
echo ""
echo "┌─ Timmy initiates: ─────────────────────────────────────────"
echo "$RESPONSE" | sed 's/^/│ /'
echo "└────────────────────────────────────────────────────────────"
echo ""
CONSECUTIVE_FAILURES=0
send_to_hermes "$RESPONSE"
LAST_MESSAGE_TIME=$(date +%s)
update_state "waiting for Hermes reply after initiation"
fi
fi
fi
sleep "$TURN_DELAY"
done

View File

@@ -46,6 +46,10 @@ if [ "$PANE_COUNT" -lt 2 ]; then
fi
# Check Hermes loop (window 1, pane 1)
# Skip if Hermes is gracefully paused (waiting for backend)
if [ -f "$HOME/.tower/hermes.paused" ]; then
log "Hermes is paused ($(cat "$HOME/.tower/hermes.paused")). Skipping restart."
else
HERMES_PID=$(tmux display-message -p -t "$SESSION:1.1" '#{pane_pid}' 2>/dev/null)
if [ -n "$HERMES_PID" ]; then
CHILDREN=$(pgrep -P "$HERMES_PID" 2>/dev/null | wc -l | tr -d ' ')
@@ -55,6 +59,7 @@ if [ -n "$HERMES_PID" ]; then
tmux send-keys -t "$SESSION:1.1" "$TOWER_BIN/tower-hermes.sh" Enter
fi
fi
fi
# Check status pane (window 1, pane 2) — restart if dead
STATUS_PID=$(tmux display-message -p -t "$SESSION:1.2" '#{pane_pid}' 2>/dev/null)
@@ -72,6 +77,10 @@ if ! tmux has-window -t "$SESSION:2" 2>/dev/null; then
tmux new-window -t "$SESSION" -n "timmy-bg"
tmux send-keys -t "$SESSION:2" "$TOWER_BIN/tower-timmy.sh" Enter
tmux select-window -t "$SESSION:1"
else
# Skip if Timmy is gracefully paused (waiting for backend)
if [ -f "$HOME/.tower/timmy.paused" ]; then
log "Timmy is paused ($(cat "$HOME/.tower/timmy.paused")). Skipping restart."
else
TIMMY_PID=$(tmux display-message -p -t "$SESSION:2" '#{pane_pid}' 2>/dev/null)
if [ -n "$TIMMY_PID" ]; then
@@ -83,6 +92,7 @@ else
fi
fi
fi
fi
# Trim log if > 1000 lines
if [ -f "$LOG" ] && [ "$(wc -l < "$LOG")" -gt 1000 ]; then