graceful pause on backend failure for all loop scripts

- tower-hermes.sh: health check, pause file, exponential backoff, message held on failure
- tower-timmy.sh: same pattern for Ollama backend, messages held not lost
- timmy-loop.sh: pause after 2 consecutive cycle failures, backoff probe
- tower-watchdog.sh: respect .paused files, don't restart paused loops

Behavior:
  1 failure  → log warning, retry next turn
  2 failures → enter pause, write .paused file, probe with backoff
  Backend returns → auto-resume, clear pause file
  Watchdog sees .paused → skip that loop, don't fight the pause
  Messages from the other side → held in inbox, not lost
This commit is contained in:
Alexander Whitestone
2026-03-18 20:08:57 -04:00
parent 0c4a7356c0
commit 00b0dc8e38
4 changed files with 316 additions and 54 deletions

View File

@@ -65,6 +65,58 @@ log() {
echo "[$(date '+%H:%M:%S')] $*"
}
# ── Pause/backoff configuration ──────────────────────────────────────
# Marker file whose presence signals this loop paused itself; the
# watchdog checks for it and skips restarting a paused loop.
readonly PAUSE_FILE="$REPO/.loop/PAUSED"
# Mutable counter of consecutive failed cycles; reset to 0 on any
# successful cycle or on resume.
CONSECUTIVE_FAILURES=0
# Seconds to wait before the first backend probe after pausing.
readonly HEALTH_CHECK_INTERVAL=30
# Cap (seconds) on the exponential probe backoff.
readonly MAX_BACKOFF=300
# ── Backend health check (Anthropic) ─────────────────────────────────
# Probe the backend with a throwaway "ping" query.
# Returns 0 when a plausible (>2 chars) reply comes back, 1 otherwise.
# Errors from the probe itself are deliberately silenced.
check_backend() {
  local reply
  reply=$(hermes chat -q "ping" -Q 2>/dev/null) || true
  [ -n "$reply" ] && [ "${#reply}" -gt 2 ]
}
# Mark the loop as paused: record the reason (with a timestamp) in the
# pause marker file, log it, and publish "paused" via update_state.
# $1 - human-readable reason; defaults to "backend unreachable".
enter_pause() {
  local why="${1:-backend unreachable}"
  local stamp
  stamp=$(date '+%H:%M:%S')
  echo "$why (since $stamp)" > "$PAUSE_FILE"
  log "⏸ PAUSED: $why"
  update_state "status" '"paused"'
}
# Undo a pause: zero the failure counter, drop the marker file, log the
# resume, and publish "running" via update_state.
leave_pause() {
  CONSECUTIVE_FAILURES=0
  rm -f "$PAUSE_FILE"
  log "▶ RESUMED: backend is back"
  update_state "status" '"running"'
}
# Block until the backend answers a health probe, sleeping between
# probes with exponential backoff (HEALTH_CHECK_INTERVAL seconds to
# start, doubling, capped at MAX_BACKOFF). While paused, still honors
# the operator STOP file and exits 0 if one appears. On recovery calls
# leave_pause and returns 0.
wait_for_backend() {
  local wait_time=$HEALTH_CHECK_INTERVAL
  while true; do
    # Check for STOP file even while paused
    if [ -f "$REPO/.loop/STOP" ]; then
      log "STOP file found while paused. Halting."
      update_state "status" '"stopped"'
      exit 0
    fi
    sleep "$wait_time"
    log "Probing backend..."
    if check_backend; then
      leave_pause
      return 0
    fi
    # BUGFIX: double and clamp BEFORE logging, so the message reports
    # the actual delay until the next probe (previously it logged the
    # pre-doubled value and misreported the wait).
    wait_time=$(( wait_time * 2 ))
    if [ "$wait_time" -gt "$MAX_BACKOFF" ]; then
      wait_time=$MAX_BACKOFF
    fi
    log "Backend still down. Next probe in ${wait_time}s"
  done
}
# ── Expire stale claims ──────────────────────────────────────────────
expire_claims() {
python3 -c "
@@ -270,6 +322,21 @@ with open('$STATE', 'w') as f: json.dump(s, f, indent=2)
# ── Cleanup on failure ───────────────────────────────────────
# NOTE(review): this fragment is the failure branch of the main cycle
# loop; the opening `if` and the loop itself are outside this view.
cleanup_cycle "$CYCLE"
# ── Backend down? Pause with backoff ─────────────────────────
# One failure is tolerated (logged upstream); only after the second
# consecutive failure do we probe the backend directly.
CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
if [ "$CONSECUTIVE_FAILURES" -ge 2 ]; then
log "$CONSECUTIVE_FAILURES consecutive failures. Checking backend..."
if ! check_backend; then
# wait_for_backend blocks until the backend recovers (auto-resume)
# or a STOP file appears; it clears the pause state on return.
enter_pause "backend down after $CONSECUTIVE_FAILURES consecutive failures"
wait_for_backend
fi
fi
fi
# Reset failure counter on success (already 0 path above)
# The 2>/dev/null swallows the test's own error if EXIT_CODE is
# non-numeric — presumably defensive; TODO confirm EXIT_CODE source.
if [ "${EXIT_CODE:-0}" -eq 0 ] 2>/dev/null; then
CONSECUTIVE_FAILURES=0
fi
log "Cooling down ${COOLDOWN}s before next cycle..."