graceful pause on backend failure for all loop scripts

- tower-hermes.sh: health check, pause file, exponential backoff, message held on failure
- tower-timmy.sh: same pattern for Ollama backend, messages held not lost
- timmy-loop.sh: pause after 2 consecutive cycle failures, backoff probe
- tower-watchdog.sh: respect .paused files, don't restart paused loops

Behavior:
  1 failure → log warning, retry next turn
  2 failures → enter pause, write .paused file, probe with backoff
  Backend returns → auto-resume, clear pause file
  Watchdog sees .paused → skip that loop, don't fight the pause
  Messages from the other side → held in inbox, not lost
2026-03-18 20:08:57 -04:00
parent 0c4a7356c0
commit 00b0dc8e38
4 changed files with 316 additions and 54 deletions
--- a/bin/timmy-loop.sh
+++ b/bin/timmy-loop.sh
@@ -65,6 +65,58 @@ log() {
    echo "[$(date '+%H:%M:%S')] $*"
 }

+PAUSE_FILE="$REPO/.loop/PAUSED"  # marker written by enter_pause, removed by leave_pause
+CONSECUTIVE_FAILURES=0  # failed cycles in a row; reset by leave_pause and when EXIT_CODE is 0
+HEALTH_CHECK_INTERVAL=30  # seconds wait_for_backend sleeps before its first probe
+MAX_BACKOFF=300  # seconds; cap for the doubling delay in wait_for_backend
+
+# ── Backend health check (Anthropic) ─────────────────────────────────
+check_backend() {  # Probe the backend via hermes; returns 0 only for a reply longer than 2 chars.
+    local reply
+    reply=$(hermes chat -q "ping" -Q 2>/dev/null) || true  # exit status ignored; only the output is judged
+    if [ "${#reply}" -le 2 ]; then  # an empty reply has length 0, so it is rejected here too
+        return 1
+    fi
+    return 0
+}
+
+enter_pause() {  # Write the pause marker file, log the pause, and pass status "paused" to update_state.
+    local why=${1:-backend unreachable}  # $1: reason text, optional
+    echo "$why (since $(date '+%H:%M:%S'))" > "$PAUSE_FILE"  # '>' overwrites any earlier marker
+    log "⏸  PAUSED: $why"
+    update_state "status" '"paused"'
+}
+
+leave_pause() {  # Undo enter_pause: zero the failure counter, delete the marker, pass status "running" on.
+    CONSECUTIVE_FAILURES=0  # global counter, so the next failure starts a fresh streak
+    rm -f "$PAUSE_FILE"  # -f: no error if the marker is already gone
+    log "▶  RESUMED: backend is back"
+    update_state "status" '"running"'
+}
+
+wait_for_backend() {  # Block until check_backend succeeds, probing with a doubling, capped delay.
+    local wait_time=$HEALTH_CHECK_INTERVAL  # first delay; doubled after every failed probe
+    while true; do
+        # Check for STOP file even while paused
+        if [ -f "$REPO/.loop/STOP" ]; then
+            log "STOP file found while paused. Halting."
+            update_state "status" '"stopped"'
+            exit 0  # ends the script, not just this function
+        fi
+        sleep "$wait_time"
+        log "Probing backend..."
+        if check_backend; then
+            leave_pause  # removes the pause file and resets the failure counter
+            return 0
+        fi
+        wait_time=$(( wait_time * 2 ))  # grow first so the log below reports the sleep that actually follows
+        if [ "$wait_time" -gt "$MAX_BACKOFF" ]; then
+            wait_time=$MAX_BACKOFF
+        fi
+        log "Backend still down. Next probe in ${wait_time}s"
+    done
+}
+
 # ── Expire stale claims ──────────────────────────────────────────────
 expire_claims() {
    python3 -c "
@@ -270,6 +322,21 @@ with open('$STATE', 'w') as f: json.dump(s, f, indent=2)

        # ── Cleanup on failure ───────────────────────────────────────
        cleanup_cycle "$CYCLE"
+
+        # ── Backend down? Pause with backoff ─────────────────────────
+        CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
+        if [ "$CONSECUTIVE_FAILURES" -ge 2 ]; then
+            log "⏸  $CONSECUTIVE_FAILURES consecutive failures. Checking backend..."
+            if ! check_backend; then
+                enter_pause "backend down after $CONSECUTIVE_FAILURES consecutive failures"
+                wait_for_backend
+            fi
+        fi
+    fi
+
+    # Reset failure counter on success (already 0 path above)
+    if [ "${EXIT_CODE:-0}" -eq 0 ] 2>/dev/null; then
+        CONSECUTIVE_FAILURES=0
    fi

    log "Cooling down ${COOLDOWN}s before next cycle..."