From 9f81c11ba08b25dc7dc0f7d7393db9bbd17000ec Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Tue, 17 Mar 2026 04:21:16 -0700
Subject: [PATCH] feat: eager fallback to backup model on rate-limit errors
 (#1730)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a fallback model is configured, switch to it immediately upon
detecting rate-limit conditions (429, quota exhaustion, empty/malformed
responses) instead of exhausting all retries with exponential backoff.

Two eager-fallback checks:
1. Invalid/empty API responses — fallback attempted before retry loop
2. HTTP 429 / rate-limit keyword detection — fallback before backoff

Both guarded by _fallback_activated for one-shot semantics.

Cherry-picked from PR #1413 by usvimal.

Co-authored-by: usvimal <usvimal@users.noreply.github.com>
---
 run_agent.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/run_agent.py b/run_agent.py
index 3a0938667..1133d0771 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -5132,6 +5132,13 @@ class AIAgent:
                         # This is often rate limiting or provider returning malformed response
                         retry_count += 1
                         
+                        # Eager fallback: empty/malformed responses are a common
+                        # rate-limit symptom.  Switch to fallback immediately
+                        # rather than retrying with extended backoff.
+                        if not self._fallback_activated and self._try_activate_fallback():
+                            retry_count = 0
+                            continue
+
                         # Check for error field in response (some providers include this)
                         error_msg = "Unknown"
                         provider_name = "Unknown"
@@ -5485,6 +5492,24 @@ class AIAgent:
                     # A 413 is a payload-size error — the correct response is to
                     # compress history and retry, not abort immediately.
                     status_code = getattr(api_error, "status_code", None)
+
+                    # Eager fallback for rate-limit errors (429 or quota exhaustion).
+                    # When a fallback model is configured, switch immediately instead
+                    # of burning through retries with exponential backoff -- the
+                    # primary provider won't recover within the retry window.
+                    is_rate_limited = (
+                        status_code == 429
+                        or "rate limit" in error_msg
+                        or "too many requests" in error_msg
+                        or "rate_limit" in error_msg
+                        or "usage limit" in error_msg
+                        or "quota" in error_msg
+                    )
+                    if is_rate_limited and not self._fallback_activated:
+                        if self._try_activate_fallback():
+                            retry_count = 0
+                            continue
+
                     is_payload_too_large = (
                         status_code == 413
                         or 'request entity too large' in error_msg