graceful pause on backend failure for all loop scripts
- tower-hermes.sh: health check, pause file, exponential backoff; message held on failure
- tower-timmy.sh: same pattern for the Ollama backend; messages are held, not lost
- timmy-loop.sh: pause after 2 consecutive cycle failures, then probe with backoff
- tower-watchdog.sh: respect .paused files; don't restart paused loops

Behavior:
- 1 failure → log a warning, retry next turn
- 2 failures → enter pause, write the .paused file, probe with backoff
- Backend returns → auto-resume, clear the pause file
- Watchdog sees .paused → skip that loop, don't fight the pause
- Messages from the other side → held in inbox, not lost
This commit is contained in:
@@ -65,6 +65,58 @@ log() {
|
||||
echo "[$(date '+%H:%M:%S')] $*"
|
||||
}
|
||||
|
||||
# Marker file written by enter_pause and removed by leave_pause.
PAUSE_FILE="$REPO/.loop/PAUSED"
# Count of failed cycles in a row; reset to 0 on success or on resume.
CONSECUTIVE_FAILURES=0
# Seconds to wait before the first backend probe while paused.
HEALTH_CHECK_INTERVAL=30
# Upper bound, in seconds, for the doubling probe interval.
MAX_BACKOFF=300
|
||||
|
||||
# ── Backend health check (Anthropic) ─────────────────────────────────
|
||||
# Probe the backend by sending the prompt "ping" through `hermes chat`.
# Arguments: none
# Returns:   0 if the reply is longer than 2 characters;
#            1 otherwise (empty reply, very short reply, or hermes failed).
check_backend() {
  local result
  # Best-effort probe: stderr is discarded and a non-zero exit is tolerated,
  # so a failing hermes simply leaves $result empty and is reported as down.
  result=$(hermes chat -q "ping" -Q 2>/dev/null) || true
  # The length check also covers the empty-reply case.
  if [ "${#result}" -gt 2 ]; then
    return 0
  fi
  return 1
}
|
||||
|
||||
# Mark the loop as paused.
# Globals:   PAUSE_FILE (read) - path of the marker file to write
# Arguments: $1 - human-readable reason (default: "backend unreachable")
# Effects:   overwrites $PAUSE_FILE with "<reason> (since HH:MM:SS)",
#            logs the pause, and sets the state key "status" to "paused".
enter_pause() {
  local reason="${1:-backend unreachable}"
  # printf rather than echo: the reason is free-form text and must be
  # written verbatim even if it starts with "-n"/"-e" or holds backslashes.
  printf '%s\n' "$reason (since $(date '+%H:%M:%S'))" > "$PAUSE_FILE"
  log "⏸ PAUSED: $reason"
  update_state "status" '"paused"'
}
|
||||
|
||||
# Clear the paused state once the backend answers again.
# Globals:   PAUSE_FILE (read), CONSECUTIVE_FAILURES (written, reset to 0)
# Arguments: none
# Effects:   removes $PAUSE_FILE if present, logs the resume, and sets the
#            state key "status" to "running".
leave_pause() {
  rm -f -- "$PAUSE_FILE"
  CONSECUTIVE_FAILURES=0
  log "▶ RESUMED: backend is back"
  update_state "status" '"running"'
}
|
||||
|
||||
# Block until the backend answers again, probing with exponential backoff.
# Globals:   HEALTH_CHECK_INTERVAL (read) - first wait, in seconds
#            MAX_BACKOFF (read)           - cap for the doubled wait, in seconds
#            REPO (read)                  - root that holds .loop/STOP
# Arguments: none
# Returns:   0 once check_backend succeeds (after calling leave_pause).
# Exits:     the whole script with status 0 if $REPO/.loop/STOP exists at the
#            start of an iteration.
wait_for_backend() {
  local wait_time=$HEALTH_CHECK_INTERVAL
  while true; do
    # Honor a STOP request even while paused.
    if [ -f "$REPO/.loop/STOP" ]; then
      log "STOP file found while paused. Halting."
      update_state "status" '"stopped"'
      exit 0
    fi

    sleep "$wait_time"
    log "Probing backend..."
    if check_backend; then
      leave_pause
      return 0
    fi

    # Double the interval, capped at MAX_BACKOFF, BEFORE logging so the
    # message reports the wait that is actually about to happen.
    wait_time=$(( wait_time * 2 ))
    if [ "$wait_time" -gt "$MAX_BACKOFF" ]; then
      wait_time=$MAX_BACKOFF
    fi
    log "Backend still down. Next probe in ${wait_time}s"
  done
}
|
||||
|
||||
# ── Expire stale claims ──────────────────────────────────────────────
|
||||
expire_claims() {
|
||||
python3 -c "
|
||||
@@ -270,6 +322,21 @@ with open('$STATE', 'w') as f: json.dump(s, f, indent=2)
|
||||
|
||||
# ── Cleanup on failure ───────────────────────────────────────
|
||||
cleanup_cycle "$CYCLE"
|
||||
|
||||
# ── Backend down? Pause with backoff ─────────────────────────
|
||||
CONSECUTIVE_FAILURES=$(( CONSECUTIVE_FAILURES + 1 ))
|
||||
if [ "$CONSECUTIVE_FAILURES" -ge 2 ]; then
|
||||
log "⏸ $CONSECUTIVE_FAILURES consecutive failures. Checking backend..."
|
||||
if ! check_backend; then
|
||||
enter_pause "backend down after $CONSECUTIVE_FAILURES consecutive failures"
|
||||
wait_for_backend
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# Reset failure counter on success (already 0 path above)
|
||||
if [ "${EXIT_CODE:-0}" -eq 0 ] 2>/dev/null; then
|
||||
CONSECUTIVE_FAILURES=0
|
||||
fi
|
||||
|
||||
log "Cooling down ${COOLDOWN}s before next cycle..."
|
||||
|
||||
Reference in New Issue
Block a user