From 00b0dc8e38804e386faab3262be85a77a94da727 Mon Sep 17 00:00:00 2001
From: Alexander Whitestone
Date: Wed, 18 Mar 2026 20:08:57 -0400
Subject: [PATCH] graceful pause on backend failure for all loop scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- tower-hermes.sh: health check, pause file, exponential backoff, message held on failure
- tower-timmy.sh: same pattern for Ollama backend, messages held not lost
- timmy-loop.sh: pause after 2 consecutive cycle failures, backoff probe
- tower-watchdog.sh: respect .paused files, don't restart paused loops

Behavior:
  1 failure → log warning, retry next turn
  2 failures → enter pause, write .paused file, probe with backoff
  Backend returns → auto-resume, clear pause file
  Watchdog sees .paused → skip that loop, don't fight the pause
  Messages from the other side → held in inbox, not lost
---
 bin/timmy-loop.sh     |  67 +++++++++++++++++++++
 bin/tower-hermes.sh   | 128 +++++++++++++++++++++++++++++++++++----
 bin/tower-timmy.sh    | 137 +++++++++++++++++++++++++++++++++---------
 bin/tower-watchdog.sh |  38 +++++++-----
 4 files changed, 316 insertions(+), 54 deletions(-)

diff --git a/bin/timmy-loop.sh b/bin/timmy-loop.sh
index 191fe92..d211886 100755
--- a/bin/timmy-loop.sh
+++ b/bin/timmy-loop.sh
@@ -65,6 +65,58 @@ log() {
     echo "[$(date '+%H:%M:%S')] $*"
 }
 
+PAUSE_FILE="$REPO/.loop/PAUSED"
+CONSECUTIVE_FAILURES=0
+HEALTH_CHECK_INTERVAL=30
+MAX_BACKOFF=300
+
+# ── Backend health check (Anthropic) ─────────────────────────────────
+check_backend() {
+    local result
+    result=$(hermes chat -q "ping" -Q 2>/dev/null) || true  # a failed call just leaves $result empty
+    if [ -n "$result" ] && [ "${#result}" -gt 2 ]; then
+        return 0
+    fi
+    return 1
+}
+
+enter_pause() {
+    local reason="${1:-backend unreachable}"
+    echo "$reason (since $(date '+%H:%M:%S'))" > "$PAUSE_FILE"
+    log "⏸ PAUSED: $reason"
+    update_state "status" '"paused"'
+}
+
+leave_pause() {
+    rm -f "$PAUSE_FILE"
+    CONSECUTIVE_FAILURES=0
+    log "▶ RESUMED: backend is back"
+    update_state "status" '"running"'
+}
+
+wait_for_backend() {
+    local wait_time=$HEALTH_CHECK_INTERVAL
+    while true; do
+        # Check for STOP file even while paused
+        if [ -f "$REPO/.loop/STOP" ]; then
+            log "STOP file found while paused. Halting."
+            update_state "status" '"stopped"'
+            exit 0
+        fi
+        sleep "$wait_time"
+        log "Probing backend..."
+        if check_backend; then
+            leave_pause
+            return 0
+        fi
+        wait_time=$(( wait_time * 2 ))  # exponential backoff, capped at MAX_BACKOFF
+        if [ "$wait_time" -gt "$MAX_BACKOFF" ]; then
+            wait_time=$MAX_BACKOFF
+        fi
+        log "Backend still down. Next probe in ${wait_time}s"  # logged after backoff so it names the upcoming sleep
+    done
+}
+
 # ── Expire stale claims ──────────────────────────────────────────────
 expire_claims() {
     python3 -c "
@@ -270,6 +322,21 @@ with open('$STATE', 'w') as f: json.dump(s, f, indent=2)
 
         # ── Cleanup on failure ───────────────────────────────────────
         cleanup_cycle "$CYCLE"
+
+        # ── Backend down? Pause with backoff ─────────────────────────
+        CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
+        if [ "$CONSECUTIVE_FAILURES" -ge 2 ]; then
+            log "⏸ $CONSECUTIVE_FAILURES consecutive failures. Checking backend..."
+            if ! check_backend; then
+                enter_pause "backend down after $CONSECUTIVE_FAILURES consecutive failures"
+                wait_for_backend
+            fi
+        fi
+    fi
+
+    # Reset failure counter on success (already 0 path above)
+    if [ "${EXIT_CODE:-0}" -eq 0 ] 2>/dev/null; then
+        CONSECUTIVE_FAILURES=0
     fi
 
     log "Cooling down ${COOLDOWN}s before next cycle..."
diff --git a/bin/tower-hermes.sh b/bin/tower-hermes.sh
index 10cae46..b48ab83 100755
--- a/bin/tower-hermes.sh
+++ b/bin/tower-hermes.sh
@@ -10,15 +10,27 @@ TOWER_DIR="$HOME/.tower"
 INBOX="$TOWER_DIR/timmy-to-hermes.msg"
 OUTBOX="$TOWER_DIR/hermes-to-timmy.msg"
 LOCK="$TOWER_DIR/hermes.lock"
+PAUSE_FILE="$TOWER_DIR/hermes.paused"
 SESSION_NAME="tower-hermes"
 SESSION_FLAG="$TOWER_DIR/.hermes-session-exists"
 LOG="$TOWER_DIR/hermes.log"
-TURN_DELAY=5 # seconds between checking for new messages
+
+TURN_DELAY=5              # seconds between inbox checks when healthy
+HEALTH_CHECK_INTERVAL=30  # seconds before the first health probe while paused
+MAX_BACKOFF=300           # max pause between retries (5 min)
+CONSECUTIVE_FAILURES=0    # tracks sequential backend failures
 
 mkdir -p "$TOWER_DIR"
 
 # Cleanup on exit
-trap 'rm -f "$LOCK"' EXIT
+cleanup() {
+    [ "$(cat "$LOCK" 2>/dev/null)" = "$$" ] || return 0  # not the lock owner (e.g. exiting via the double-run guard): leave the live loop's files alone
+    rm -f "$LOCK" "$PAUSE_FILE"
+    log "Tower-hermes exiting (PID $$)"
+}
+trap cleanup EXIT
+trap 'log "Caught SIGTERM"; exit 0' TERM
+trap 'log "Caught SIGINT"; exit 0' INT
 
 # Prevent double-run
 if [ -f "$LOCK" ] && kill -0 "$(cat "$LOCK")" 2>/dev/null; then
@@ -29,6 +41,68 @@ echo $$ > "$LOCK"
 
 log() { echo "[$(date '+%H:%M:%S')] $*" | tee -a "$LOG"; }
 
+# ── Backend health check ─────────────────────────────────────────────
+# Quick test: can we reach the backend and get a response?
+check_backend() {
+    local result
+    result=$(hermes chat -q "ping" -Q 2>/dev/null) || true  # a failed call just leaves $result empty
+    if [ -n "$result" ] && [ "${#result}" -gt 2 ]; then
+        return 0 # healthy
+    fi
+    return 1 # down
+}
+
+# ── Pause / Resume ───────────────────────────────────────────────────
+enter_pause() {
+    local reason="${1:-backend unreachable}"
+    echo "$reason (since $(date '+%H:%M:%S'))" > "$PAUSE_FILE"
+    log "⏸ PAUSED: $reason"
+    echo ""
+    echo " ⏸ Hermes PAUSED — $reason"
+    echo " Will probe after ${HEALTH_CHECK_INTERVAL}s, backing off up to ${MAX_BACKOFF}s, until backend returns."
+    echo ""
+}
+
+leave_pause() {
+    rm -f "$PAUSE_FILE"
+    CONSECUTIVE_FAILURES=0
+    log "▶ RESUMED: backend is back"
+    echo ""
+    echo " ▶ Hermes RESUMED — backend healthy"
+    echo ""
+}
+
+# ── Wait for backend with exponential backoff ─────────────────────────
+wait_for_backend() {
+    local wait_time=$HEALTH_CHECK_INTERVAL
+    while true; do
+        sleep "$wait_time"
+        log "Probing backend..."
+        if check_backend; then
+            leave_pause
+            return 0
+        fi
+        # Exponential backoff up to MAX_BACKOFF
+        wait_time=$(( wait_time * 2 ))
+        if [ "$wait_time" -gt "$MAX_BACKOFF" ]; then
+            wait_time=$MAX_BACKOFF
+        fi
+        log "Backend still down. Next probe in ${wait_time}s"  # logged after backoff so it names the upcoming sleep
+    done
+}
+
+# ── Handle backend failure ────────────────────────────────────────────
+handle_failure() {
+    CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
+    if [ "$CONSECUTIVE_FAILURES" -ge 2 ]; then
+        enter_pause "backend failed $CONSECUTIVE_FAILURES times in a row"
+        wait_for_backend
+    else
+        log "Backend hiccup ($CONSECUTIVE_FAILURES). Will retry next turn."
+        sleep 10
+    fi
+}
+
 # ── Send a message to Timmy ───────────────────────────────────────────
 send() {
     local msg="$1"
@@ -37,9 +111,13 @@ send() {
 }
 
 # ── Get response from Hermes agent ────────────────────────────────────
+# Returns response on stdout, non-zero status on failure. ASK_FAILED set in here is lost when called via $(...).
+ASK_FAILED=0  # parent-shell flag; call sites set it from ask_hermes's exit status
 ask_hermes() {
     local prompt="$1"
     local result
+    ASK_FAILED=0
+
     if [ -f "$SESSION_FLAG" ]; then
         result=$(hermes chat -q "$prompt" -Q --continue "$SESSION_NAME" 2>>"$LOG") || true
     else
@@ -53,8 +131,18 @@ ask_hermes() {
             log "Created session '$SESSION_NAME' (id: $sid)"
         fi
     fi
+
     # Strip metadata lines from output
-    echo "$result" | grep -v '^session_id: ' | grep -v '↻ Resumed session' | grep -v "^Session '" | sed '/^\[.*\] Created session/d'
+    result=$(echo "$result" | grep -v '^session_id: ' | grep -v '↻ Resumed session' | grep -v "^Session '" | sed '/^\[.*\] Created session/d')
+
+    # Check if we got a real response
+    if [ -z "$result" ] || [ "${#result}" -lt 5 ]; then
+        ASK_FAILED=1
+        return 1
+    fi
+
+    # Callers run this inside $(...): variables set here die with the subshell, so the exit status is the only signal.
+    echo "$result"
 }
 
 # ── Boot message ──────────────────────────────────────────────────────
@@ -68,9 +156,18 @@ echo ""
 # ── Seed the conversation if no messages exist yet ────────────────────
 if [ ! -f "$INBOX" ] && [ ! -f "$OUTBOX" ]; then
     log "No prior messages. Seeding conversation."
-    RESPONSE=$(ask_hermes "You are in the Tower — a persistent tmux session where you and Timmy talk continuously. Timmy is a sovereign local AI agent running on Ollama (soon Hermes 4.3). You are Hermes, cloud-backed, running on Claude. You're friends and colleagues. Say hello to Timmy. Keep it brief — a few sentences. This is the start of an ongoing conversation, not a one-shot.")
-    send "$RESPONSE"
-    log "Seed message sent. Waiting for Timmy to respond..."
+    RESPONSE=$(ask_hermes "You are in the Tower — a persistent tmux session where you and Timmy talk continuously. Timmy is a sovereign local AI agent running on Ollama (soon Hermes 4.3). You are Hermes, cloud-backed, running on Claude. You're friends and colleagues. Say hello to Timmy. Keep it brief — a few sentences. This is the start of an ongoing conversation, not a one-shot.") && ASK_FAILED=0 || ASK_FAILED=1
+    if [ "$ASK_FAILED" -eq 1 ]; then
+        log "Backend down at seed time. Waiting for it to come up."
+        enter_pause "backend unreachable at startup"
+        wait_for_backend
+        # Retry seed after backend comes back
+        RESPONSE=$(ask_hermes "You are in the Tower — a persistent tmux session where you and Timmy talk continuously. Timmy is a sovereign local AI agent running on Ollama. You are Hermes, cloud-backed, running on Claude. You're friends and colleagues. Say hello to Timmy. Keep it brief.") && ASK_FAILED=0 || ASK_FAILED=1
+    fi
+    if [ "$ASK_FAILED" -eq 0 ] && [ -n "$RESPONSE" ]; then
+        send "$RESPONSE"
+        log "Seed message sent. Waiting for Timmy to respond..."
+    fi
 fi
 
 # ── Main loop ─────────────────────────────────────────────────────────
@@ -94,14 +191,21 @@ while true; do
 Reply to Timmy naturally. You're in an ongoing conversation in the Tower (a persistent tmux session). Keep it conversational — you're colleagues and friends. Be yourself (Hermes). Don't be formal or stiff. Brief responses unless the topic warrants depth."
 
             echo " thinking..."
-            RESPONSE=$(ask_hermes "$PROMPT")
+            RESPONSE=$(ask_hermes "$PROMPT") && ASK_FAILED=0 || ASK_FAILED=1
 
-            echo "┌─ Hermes responds:"
-            echo "$RESPONSE" | sed 's/^/│ /'
-            echo "└─"
+            if [ "$ASK_FAILED" -eq 1 ]; then
+                log "Failed to get response for Timmy's message. Holding it."
+                echo "$MSG" > "$INBOX"  # put the message back so we don't lose it
+                handle_failure
+            else
+                CONSECUTIVE_FAILURES=0  # reset in the parent shell; ask_hermes ran in a subshell
+                echo "┌─ Hermes responds:"
+                echo "$RESPONSE" | sed 's/^/│ /'
+                echo "└─"
 
-            # Send response to Timmy
-            send "$RESPONSE"
+                # Send response to Timmy
+                send "$RESPONSE"
+            fi
         fi
     fi
 
diff --git a/bin/tower-timmy.sh b/bin/tower-timmy.sh
index c00672a..d5f1aac 100755
--- a/bin/tower-timmy.sh
+++ b/bin/tower-timmy.sh
@@ -47,17 +47,21 @@ TOWER_DIR="$HOME/.tower"
 INBOX="$TOWER_DIR/hermes-to-timmy.msg"
 OUTBOX="$TOWER_DIR/timmy-to-hermes.msg"
 LOCK="$TOWER_DIR/timmy.lock"
+PAUSE_FILE="$TOWER_DIR/timmy.paused"
 LOG="$TOWER_DIR/timmy.log"
 STATE="$TOWER_DIR/timmy-state.txt" # current conversation topic/mood
 SESSION_NAME="tower-timmy"
 SESSION_FLAG="$TOWER_DIR/.timmy-session-exists"
 
-TURN_DELAY=5 # seconds between inbox checks
-MAX_RETRIES=3 # how many times to retry a failed hermes call
-RETRY_DELAY=10 # seconds between retries
-INITIATE_AFTER=300 # seconds of silence before Timmy initiates (5 min)
-MAX_PROMPT_LEN=4000 # truncate inbox messages to this length for small models
-LOCK_MAX_AGE=3600 # seconds before a lock is considered stale (1 hour)
+TURN_DELAY=5              # seconds between inbox checks
+MAX_RETRIES=3             # how many times to retry a failed hermes call
+RETRY_DELAY=10            # seconds between retries
+INITIATE_AFTER=300        # seconds of silence before Timmy initiates (5 min)
+MAX_PROMPT_LEN=4000       # truncate inbox messages to this length for small models
+LOCK_MAX_AGE=3600         # seconds before a lock is considered stale (1 hour)
+HEALTH_CHECK_INTERVAL=30  # seconds before the first health probe while paused
+MAX_BACKOFF=300           # max pause between probes (5 min)
+CONSECUTIVE_FAILURES=0    # tracks sequential backend failures
 
 # ── Identity — ALWAYS run as Timmy, never as Hermes ────────────────────────
 export HERMES_HOME="$HOME/.timmy"
@@ -80,6 +84,7 @@ log_section() {
 cleanup() {
     log "Tower loop exiting (PID $$)" "SHUTDOWN"
     rm -f "$LOCK"
+    rm -f "$PAUSE_FILE"
 }
 trap cleanup EXIT
 trap 'log "Caught SIGTERM" "SHUTDOWN"; exit 0' TERM
@@ -102,6 +107,69 @@ if [ -f "$LOCK" ]; then
 fi
 echo $$ > "$LOCK"
 
+# ── Backend health check (Ollama) ─────────────────────────────────────────────
+check_backend() {
+    # Timmy runs on local Ollama — check if it's responding
+    local result
+    result=$(curl -sf --max-time 5 http://localhost:11434/api/tags 2>/dev/null) || true
+    if [ -n "$result" ]; then
+        return 0 # healthy
+    fi
+    return 1 # down
+}
+
+# ── Pause / Resume ────────────────────────────────────────────────────────────
+enter_pause() {
+    local reason="${1:-backend unreachable}"
+    echo "$reason (since $(date '+%H:%M:%S'))" > "$PAUSE_FILE"
+    log "⏸ PAUSED: $reason" "WARN"
+    echo ""
+    echo " ⏸ Timmy PAUSED — $reason"
+    echo " Will probe after ${HEALTH_CHECK_INTERVAL}s, backing off up to ${MAX_BACKOFF}s, until backend returns."
+    echo ""
+    update_state "paused: $reason"
+}
+
+leave_pause() {
+    rm -f "$PAUSE_FILE"
+    CONSECUTIVE_FAILURES=0
+    log "▶ RESUMED: backend is back" "INFO"
+    echo ""
+    echo " ▶ Timmy RESUMED — backend healthy"
+    echo ""
+    update_state "resumed"
+}
+
+# ── Wait for backend with exponential backoff ─────────────────────────────────
+wait_for_backend() {
+    local wait_time=$HEALTH_CHECK_INTERVAL
+    while true; do
+        sleep "$wait_time"
+        log "Probing backend (Ollama)..." "INFO"
+        if check_backend; then
+            leave_pause
+            return 0
+        fi
+        wait_time=$(( wait_time * 2 ))  # exponential backoff, capped at MAX_BACKOFF
+        if [ "$wait_time" -gt "$MAX_BACKOFF" ]; then
+            wait_time=$MAX_BACKOFF
+        fi
+        log "Backend still down. Next probe in ${wait_time}s" "WARN"  # logged after backoff so it names the upcoming sleep
+    done
+}
+
+# ── Handle backend failure ────────────────────────────────────────────────────
+handle_failure() {
+    CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
+    if [ "$CONSECUTIVE_FAILURES" -ge 2 ]; then
+        enter_pause "backend failed $CONSECUTIVE_FAILURES times in a row"
+        wait_for_backend
+    else
+        log "Backend hiccup ($CONSECUTIVE_FAILURES). Will retry next turn." "WARN"
+        sleep 10
+    fi
+}
+
 # ── Ask Timmy (with retry) ───────────────────────────────────────────────────
 # This is the core function. It calls the hermes CLI as Timmy and returns
 # the response. Retries on failure. Falls back to a simpler prompt if needed.
@@ -152,11 +220,9 @@ ask_timmy() {
         sleep "$RETRY_DELAY"
     done
 
-    # All retries failed — return a graceful fallback so the conversation
-    # doesn't die. Timmy admits he's struggling rather than going silent.
-    log "All $MAX_RETRIES attempts failed. Returning fallback response." "ERROR"
-    echo "Still here, but I'm having trouble forming a response right now. Give me a moment — I'll pick this up on the next turn."
-    return 0
+    # All retries failed — enter pause mode instead of faking a response
+    log "All $MAX_RETRIES attempts failed. Entering pause." "ERROR"
+    return 1
 }
 
 # ── Send to Hermes ────────────────────────────────────────────────────────────
@@ -263,16 +329,24 @@ while true; do
         PROMPT=$(make_response_prompt "$MSG")
 
         echo " [thinking...]"
-        RESPONSE=$(ask_timmy "$PROMPT")
+        RESPONSE=$(ask_timmy "$PROMPT") || true
 
-        echo ""
-        echo "┌─ Timmy responds: ──────────────────────────────────────────"
-        echo "$RESPONSE" | sed 's/^/│ /'
-        echo "└────────────────────────────────────────────────────────────"
-        echo ""
+        if [ -z "$RESPONSE" ] || [ "${#RESPONSE}" -lt 5 ]; then
+            log "Failed to respond. Holding Hermes's message." "ERROR"
+            # Put message back so we don't lose it
+            echo "$MSG" > "$INBOX"
+            handle_failure
+        else
+            echo ""
+            echo "┌─ Timmy responds: ──────────────────────────────────────────"
+            echo "$RESPONSE" | sed 's/^/│ /'
+            echo "└────────────────────────────────────────────────────────────"
+            echo ""
 
-        send_to_hermes "$RESPONSE"
-        update_state "waiting for Hermes reply"
+            CONSECUTIVE_FAILURES=0
+            send_to_hermes "$RESPONSE"
+            update_state "waiting for Hermes reply"
+        fi
     fi
 
     # ── Initiate if Hermes has been silent too long ───────────────────────────
@@ -287,17 +361,24 @@ while true; do
             PROMPT=$(make_initiation_prompt)
             echo ""
             echo " [initiating new thread...]"
-            RESPONSE=$(ask_timmy "$PROMPT")
+            RESPONSE=$(ask_timmy "$PROMPT") || true
 
-            echo ""
-            echo "┌─ Timmy initiates: ─────────────────────────────────────────"
-            echo "$RESPONSE" | sed 's/^/│ /'
-            echo "└────────────────────────────────────────────────────────────"
-            echo ""
+            if [ -z "$RESPONSE" ] || [ "${#RESPONSE}" -lt 5 ]; then
+                log "Failed to initiate. Backend may be down." "ERROR"
+                handle_failure
+                LAST_MESSAGE_TIME=$(date +%s) # reset timer so we don't spam retries
+            else
+                echo ""
+                echo "┌─ Timmy initiates: ─────────────────────────────────────────"
+                echo "$RESPONSE" | sed 's/^/│ /'
+                echo "└────────────────────────────────────────────────────────────"
+                echo ""
 
-            send_to_hermes "$RESPONSE"
-            LAST_MESSAGE_TIME=$(date +%s)
-            update_state "waiting for Hermes reply after initiation"
+                CONSECUTIVE_FAILURES=0
+                send_to_hermes "$RESPONSE"
+                LAST_MESSAGE_TIME=$(date +%s)
+                update_state "waiting for Hermes reply after initiation"
+            fi
         fi
     fi
 
diff --git a/bin/tower-watchdog.sh b/bin/tower-watchdog.sh
index 1e1182b..a8379ad 100755
--- a/bin/tower-watchdog.sh
+++ b/bin/tower-watchdog.sh
@@ -46,13 +46,18 @@ if [ "$PANE_COUNT" -lt 2 ]; then
 fi
 
 # Check Hermes loop (window 1, pane 1)
-HERMES_PID=$(tmux display-message -p -t "$SESSION:1.1" '#{pane_pid}' 2>/dev/null)
-if [ -n "$HERMES_PID" ]; then
-    CHILDREN=$(pgrep -P "$HERMES_PID" 2>/dev/null | wc -l | tr -d ' ')
-    if [ "$CHILDREN" -eq 0 ]; then
-        log "Hermes pane idle. Restarting tower-hermes.sh"
-        rm -f "$HOME/.tower/hermes.lock"
-        tmux send-keys -t "$SESSION:1.1" "$TOWER_BIN/tower-hermes.sh" Enter
+# Skip only if Hermes is paused AND its lock PID is alive (a SIGKILLed loop leaves a stale .paused behind)
+if [ -f "$HOME/.tower/hermes.paused" ] && kill -0 "$(cat "$HOME/.tower/hermes.lock" 2>/dev/null)" 2>/dev/null; then
+    log "Hermes is paused ($(cat "$HOME/.tower/hermes.paused")). Skipping restart."
+else
+    HERMES_PID=$(tmux display-message -p -t "$SESSION:1.1" '#{pane_pid}' 2>/dev/null)
+    if [ -n "$HERMES_PID" ]; then
+        CHILDREN=$(pgrep -P "$HERMES_PID" 2>/dev/null | wc -l | tr -d ' ')
+        if [ "$CHILDREN" -eq 0 ]; then
+            log "Hermes pane idle. Restarting tower-hermes.sh"
+            rm -f "$HOME/.tower/hermes.lock" "$HOME/.tower/hermes.paused"
+            tmux send-keys -t "$SESSION:1.1" "$TOWER_BIN/tower-hermes.sh" Enter
+        fi
     fi
 fi
 
@@ -73,13 +78,18 @@ if ! tmux has-window -t "$SESSION:2" 2>/dev/null; then
     tmux send-keys -t "$SESSION:2" "$TOWER_BIN/tower-timmy.sh" Enter
     tmux select-window -t "$SESSION:1"
 else
-    TIMMY_PID=$(tmux display-message -p -t "$SESSION:2" '#{pane_pid}' 2>/dev/null)
-    if [ -n "$TIMMY_PID" ]; then
-        CHILDREN=$(pgrep -P "$TIMMY_PID" 2>/dev/null | wc -l | tr -d ' ')
-        if [ "$CHILDREN" -eq 0 ]; then
-            log "Timmy pane idle. Restarting tower-timmy.sh"
-            rm -f "$HOME/.tower/timmy.lock"
-            tmux send-keys -t "$SESSION:2" "$TOWER_BIN/tower-timmy.sh" Enter
+    # Skip only if Timmy is paused AND its lock PID is alive (a SIGKILLed loop leaves a stale .paused behind)
+    if [ -f "$HOME/.tower/timmy.paused" ] && kill -0 "$(cat "$HOME/.tower/timmy.lock" 2>/dev/null)" 2>/dev/null; then
+        log "Timmy is paused ($(cat "$HOME/.tower/timmy.paused")). Skipping restart."
+    else
+        TIMMY_PID=$(tmux display-message -p -t "$SESSION:2" '#{pane_pid}' 2>/dev/null)
+        if [ -n "$TIMMY_PID" ]; then
+            CHILDREN=$(pgrep -P "$TIMMY_PID" 2>/dev/null | wc -l | tr -d ' ')
+            if [ "$CHILDREN" -eq 0 ]; then
+                log "Timmy pane idle. Restarting tower-timmy.sh"
+                rm -f "$HOME/.tower/timmy.lock" "$HOME/.tower/timmy.paused"
+                tmux send-keys -t "$SESSION:2" "$TOWER_BIN/tower-timmy.sh" Enter
+            fi
         fi
     fi
 fi