feat(providers): add ordered fallback provider chain (salvage #1761) (#3813)

Extends the single fallback_model mechanism into an ordered chain.
When the primary model fails, Hermes tries each fallback provider in
sequence until one succeeds or the chain is exhausted.

Config format (new):
  fallback_providers:
    - provider: openrouter
      model: anthropic/claude-sonnet-4
    - provider: openai
      model: gpt-4o

Legacy single-dict fallback_model format still works unchanged.

Key fix vs original PR: the call sites in the retry loop now use
_fallback_index < len(_fallback_chain) instead of the old one-shot
_fallback_activated guard, so the chain actually advances through
all configured providers.

Changes:
- run_agent.py: _fallback_chain list + _fallback_index replaces
  one-shot _fallback_model; _try_activate_fallback() advances
  through chain; failed provider resolution skips to next entry;
  call sites updated to allow chain advancement
- cli.py: reads fallback_providers with legacy fallback_model compat
- gateway/run.py: same
- hermes_cli/config.py: fallback_providers: [] in DEFAULT_CONFIG
- tests: 12 new chain tests + 6 existing test fixtures updated

Co-authored-by: uzaylisak <uzaylisak@users.noreply.github.com>
Author: Teknium
Date: 2026-03-29 16:04:53 -07:00
Committed by: GitHub
Parent commit: c774833667
This commit: 252fbea005
7 changed files with 220 additions and 31 deletions

View File

@@ -896,16 +896,30 @@ class AIAgent:
except Exception as e:
raise RuntimeError(f"Failed to initialize OpenAI client: {e}")
# Provider fallback — a single backup model/provider tried when the
# primary is exhausted (rate-limit, overload, connection failure).
# Config shape: {"provider": "openrouter", "model": "anthropic/claude-sonnet-4"}
self._fallback_model = fallback_model if isinstance(fallback_model, dict) else None
# Provider fallback chain — ordered list of backup providers tried
# when the primary is exhausted (rate-limit, overload, connection
# failure). Supports both legacy single-dict ``fallback_model`` and
# new list ``fallback_providers`` format.
if isinstance(fallback_model, list):
self._fallback_chain = [
f for f in fallback_model
if isinstance(f, dict) and f.get("provider") and f.get("model")
]
elif isinstance(fallback_model, dict) and fallback_model.get("provider") and fallback_model.get("model"):
self._fallback_chain = [fallback_model]
else:
self._fallback_chain = []
self._fallback_index = 0
self._fallback_activated = False
if self._fallback_model:
fb_p = self._fallback_model.get("provider", "")
fb_m = self._fallback_model.get("model", "")
if fb_p and fb_m and not self.quiet_mode:
print(f"🔄 Fallback model: {fb_m} ({fb_p})")
# Legacy attribute kept for backward compat (tests, external callers)
self._fallback_model = self._fallback_chain[0] if self._fallback_chain else None
if self._fallback_chain and not self.quiet_mode:
if len(self._fallback_chain) == 1:
fb = self._fallback_chain[0]
print(f"🔄 Fallback model: {fb['model']} ({fb['provider']})")
else:
print(f"🔄 Fallback chain ({len(self._fallback_chain)} providers): " +
"".join(f"{f['model']} ({f['provider']})" for f in self._fallback_chain))
# Get available tools with filtering
self.tools = get_tool_definitions(
@@ -4318,25 +4332,26 @@ class AIAgent:
# ── Provider fallback ──────────────────────────────────────────────────
def _try_activate_fallback(self) -> bool:
"""Switch to the configured fallback model/provider.
"""Switch to the next fallback model/provider in the chain.
Called when the primary model is failing after retries. Swaps the
Called when the current model is failing after retries. Swaps the
OpenAI client, model slug, and provider in-place so the retry loop
can continue with the new backend. One-shot: returns False if
already activated or not configured.
can continue with the new backend. Advances through the chain on
each call; returns False when exhausted.
Uses the centralized provider router (resolve_provider_client) for
auth resolution and client construction — no duplicated provider→key
mappings.
"""
if self._fallback_activated or not self._fallback_model:
if self._fallback_index >= len(self._fallback_chain):
return False
fb = self._fallback_model
fb = self._fallback_chain[self._fallback_index]
self._fallback_index += 1
fb_provider = (fb.get("provider") or "").strip().lower()
fb_model = (fb.get("model") or "").strip()
if not fb_provider or not fb_model:
return False
return self._try_activate_fallback() # skip invalid, try next
# Use centralized router for client construction.
# raw_codex=True because the main agent needs direct responses.stream()
@@ -4349,7 +4364,7 @@ class AIAgent:
logging.warning(
"Fallback to %s failed: provider not configured",
fb_provider)
return False
return self._try_activate_fallback() # try next in chain
# Determine api_mode from provider / base URL
fb_api_mode = "chat_completions"
@@ -4424,8 +4439,8 @@ class AIAgent:
)
return True
except Exception as e:
logging.error("Failed to activate fallback model: %s", e)
return False
logging.error("Failed to activate fallback %s: %s", fb_model, e)
return self._try_activate_fallback() # try next in chain
# ── End provider fallback ──────────────────────────────────────────────
@@ -6528,9 +6543,9 @@ class AIAgent:
# Eager fallback: empty/malformed responses are a common
# rate-limit symptom. Switch to fallback immediately
# rather than retrying with extended backoff.
if not self._fallback_activated:
if self._fallback_index < len(self._fallback_chain):
self._emit_status("⚠️ Empty/malformed response — switching to fallback...")
if not self._fallback_activated and self._try_activate_fallback():
if self._try_activate_fallback():
retry_count = 0
continue
@@ -6993,7 +7008,7 @@ class AIAgent:
or "usage limit" in error_msg
or "quota" in error_msg
)
if is_rate_limited and not self._fallback_activated:
if is_rate_limited and self._fallback_index < len(self._fallback_chain):
self._emit_status("⚠️ Rate limited — switching to fallback provider...")
if self._try_activate_fallback():
retry_count = 0