fix: auxiliary client payment fallback — retry with next provider on 402 (#5599)

When a user runs out of OpenRouter credits and switches to Codex (or any other provider), auxiliary tasks (compression, vision, web_extract) would still try OpenRouter first and fail with HTTP 402. This commit makes two fixes. (1) Payment fallback in call_llm(): when a resolved provider returns HTTP 402 or a credit-related error, the call is automatically retried with the next available provider in the auto-detection chain (OpenRouter → Nous → Custom → Codex → API-key providers), skipping the depleted provider. (2) Removal of the hardcoded OpenRouter fallback: the old code fell back specifically to OpenRouter when auto/custom resolution returned no client; it now falls back to the full auto-detection chain, which handles any available provider — not just OpenRouter. The commit also extracts _get_provider_chain() as a shared function (it replaces the inline tuple in _resolve_auto and is used by the new fallback); the chain is built at call time so that test patches on the _try_* functions remain visible. Finally, it adds 16 tests covering _is_payment_error(), _get_provider_chain(), _try_payment_fallback(), and the call_llm() integration with the 402 retry.
2026-04-06 12:41:40 -07:00
parent 8ffd44a6f9
commit da02a4e283
2 changed files with 304 additions and 11 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -34,6 +34,12 @@ than the provider's default.
 Per-task direct endpoint overrides (e.g. AUXILIARY_VISION_BASE_URL,
 AUXILIARY_VISION_API_KEY) let callers route a specific auxiliary task to a
 custom OpenAI-compatible endpoint without touching the main model settings.
+
+Payment / credit exhaustion fallback:
+  When a resolved provider returns HTTP 402 or a credit-related error,
+  call_llm() automatically retries with the next available provider in the
+  auto-detection chain.  This handles the common case where a user depletes
+  their OpenRouter balance but has Codex OAuth or another provider available.
 """

 import json
@@ -874,10 +880,90 @@ _AUTO_PROVIDER_LABELS = {
    "_resolve_api_key_provider": "api-key",
 }

-
 _AGGREGATOR_PROVIDERS = frozenset({"openrouter", "nous"})


+def _get_provider_chain() -> List[tuple]:
+    """Return the auto-detection order as ``(label, resolver)`` pairs.
+
+    The list is assembled on every call, never cached at import time, so a
+    test that patches one of the ``_try_*`` functions sees its replacement.
+    """
+    labels = ("openrouter", "nous", "local/custom", "openai-codex", "api-key")
+    resolvers = (
+        _try_openrouter, _try_nous, _try_custom_endpoint,
+        _try_codex, _resolve_api_key_provider,
+    )
+    # Pair each label with its resolver, preserving the priority order.
+    return list(zip(labels, resolvers))
+
+
+def _is_payment_error(exc: Exception) -> bool:
+    """Report whether *exc* signals payment, credit or billing exhaustion.
+
+    HTTP 402 always counts.  A 429, or an exception with no ``status_code``
+    attribute, counts only when its message contains a billing keyword, so
+    ordinary rate limits are not mistaken for an empty balance.  Any other
+    status code returns False regardless of the message.
+    """
+    status = getattr(exc, "status_code", None)
+    if status == 402:
+        return True
+    # 402 was handled above; only 429 and status-less errors are inspected,
+    # because some providers wrap credit errors in those instead of a 402.
+    if status not in (429, None):
+        return False
+    message = str(exc).lower()
+    keywords = ("credits", "insufficient funds", "can only afford", "billing", "payment required")
+    return any(kw in message for kw in keywords)
+
+
+def _try_payment_fallback(
+    failed_provider: str,
+    task: Optional[str] = None,
+) -> Tuple[Optional[Any], Optional[str], str]:
+    """Find another provider after *failed_provider* hit a payment error.
+
+    Walks ``_get_provider_chain()`` in order, skipping the chain entry that
+    corresponds to the failed provider, and returns the first client found.
+
+    Args:
+        failed_provider: Label of the provider that failed; None or an
+            empty string is tolerated and skips nothing.
+        task: Auxiliary task name, used only in log messages.
+
+    Returns:
+        (client, model, provider_label) or (None, None, "") if no fallback.
+    """
+    # Tolerate a missing label: raising here would mask the payment error.
+    skip = (failed_provider or "").lower().strip()
+    skip_labels = {skip}
+    # NOTE(review): substring test against the failed label (kept from the
+    # original); presumably meant to catch short aliases — confirm intent.
+    main_provider = _read_main_provider()
+    if main_provider and main_provider.lower() in skip:
+        skip_labels.add(main_provider.lower())
+    # Map common resolved_provider values back to chain labels.
+    _alias_to_label = {"openrouter": "openrouter", "nous": "nous",
+                       "openai-codex": "openai-codex", "codex": "openai-codex",
+                       "custom": "local/custom", "local/custom": "local/custom"}
+    skip_chain_labels = {_alias_to_label.get(s, s) for s in skip_labels}
+
+    tried = []
+    for label, try_fn in _get_provider_chain():
+        if label in skip_chain_labels:
+            continue
+        client, model = try_fn()
+        if client is not None:
+            logger.info("Auxiliary %s: payment error on %s — falling back to %s (%s)",
+                        task or "call", failed_provider, label, model or "default")
+            return client, model, label
+        tried.append(label)
+    logger.warning("Auxiliary %s: payment error on %s and no fallback available (tried: %s)",
+                   task or "call", failed_provider, ", ".join(tried))
+    return None, None, ""
+
+
 def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
    """Full auto-detection chain.

@@ -905,10 +991,7 @@ def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:

    # ── Step 2: aggregator / fallback chain ──────────────────────────────
    tried = []
-    for try_fn in (_try_openrouter, _try_nous, _try_custom_endpoint,
-                   _try_codex, _resolve_api_key_provider):
-        fn_name = getattr(try_fn, "__name__", "unknown")
-        label = _AUTO_PROVIDER_LABELS.get(fn_name, fn_name)
+    for label, try_fn in _get_provider_chain():
        client, model = try_fn()
        if client is not None:
            if tried:
@@ -1786,12 +1869,15 @@ def call_llm(
                    f"was found. Set the {_explicit.upper()}_API_KEY environment "
                    f"variable, or switch to a different provider with `hermes model`."
                )
-            # For auto/custom, fall back to OpenRouter
+            # For auto/custom with no credentials, try the full auto chain
+            # rather than hardcoding OpenRouter (which may be depleted).
+            # Pass model=None so each provider uses its own default —
+            # resolved_model may be an OpenRouter-format slug that doesn't
+            # work on other providers.
            if not resolved_base_url:
-                logger.info("Auxiliary %s: provider %s unavailable, falling back to openrouter",
+                logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
                            task or "call", resolved_provider)
-                client, final_model = _get_cached_client(
-                    "openrouter", resolved_model or _OPENROUTER_MODEL)
+                client, final_model = _get_cached_client("auto")
        if client is None:
            raise RuntimeError(
                f"No LLM provider configured for task={task} provider={resolved_provider}. "
@@ -1812,7 +1898,7 @@ def call_llm(
        tools=tools, timeout=effective_timeout, extra_body=extra_body,
        base_url=resolved_base_url)

-    # Handle max_tokens vs max_completion_tokens retry
+    # Handle max_tokens vs max_completion_tokens retry, then payment fallback.
    try:
        return client.chat.completions.create(**kwargs)
    except Exception as first_err:
@@ -1820,7 +1906,30 @@ def call_llm(
        if "max_tokens" in err_str or "unsupported_parameter" in err_str:
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
-            return client.chat.completions.create(**kwargs)
+            try:
+                return client.chat.completions.create(**kwargs)
+            except Exception as retry_err:
+                # If the max_tokens retry also hits a payment error,
+                # fall through to the payment fallback below.
+                if not _is_payment_error(retry_err):
+                    raise
+                first_err = retry_err
+
+        # ── Payment / credit exhaustion fallback ──────────────────────
+        # When the resolved provider returns 402 or a credit-related error,
+        # try alternative providers instead of giving up.  This handles the
+        # common case where a user runs out of OpenRouter credits but has
+        # Codex OAuth or another provider available.
+        if _is_payment_error(first_err):
+            fb_client, fb_model, fb_label = _try_payment_fallback(
+                resolved_provider, task)
+            if fb_client is not None:
+                fb_kwargs = _build_call_kwargs(
+                    fb_label, fb_model, messages,
+                    temperature=temperature, max_tokens=max_tokens,
+                    tools=tools, timeout=effective_timeout,
+                    extra_body=extra_body)
+                return fb_client.chat.completions.create(**fb_kwargs)
        raise


--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@@ -14,8 +14,12 @@ from agent.auxiliary_client import (
    resolve_vision_provider_client,
    resolve_provider_client,
    auxiliary_max_tokens_param,
+    call_llm,
    _read_codex_access_token,
    _get_auxiliary_provider,
+    _get_provider_chain,
+    _is_payment_error,
+    _try_payment_fallback,
    _resolve_forced_provider,
    _resolve_auto,
 )
@@ -1106,3 +1110,183 @@ class TestAuxiliaryMaxTokensParam:
             patch("agent.auxiliary_client._read_codex_access_token", return_value=None):
            result = auxiliary_max_tokens_param(1024)
        assert result == {"max_tokens": 1024}
+
+
+# ── Payment / credit exhaustion fallback ─────────────────────────────────
+
+
+class TestIsPaymentError:
+    """_is_payment_error flags HTTP 402 and credit-related error messages."""
+
+    @staticmethod
+    def _error(message, status=None):
+        """Build an exception, attaching ``status_code`` only when given."""
+        err = Exception(message)
+        if status is not None:
+            err.status_code = status
+        return err
+
+    def test_402_status_code(self):
+        assert _is_payment_error(self._error("Payment Required", 402)) is True
+
+    def test_402_with_credits_message(self):
+        msg = "You requested up to 65535 tokens, but can only afford 8029"
+        assert _is_payment_error(self._error(msg, 402)) is True
+
+    def test_429_with_credits_message(self):
+        assert _is_payment_error(self._error("insufficient credits remaining", 429)) is True
+
+    def test_429_without_credits_message_is_not_payment(self):
+        """Normal rate limits should NOT be treated as payment errors."""
+        msg = "Rate limit exceeded, try again in 2 seconds"
+        assert _is_payment_error(self._error(msg, 429)) is False
+
+    def test_generic_500_is_not_payment(self):
+        assert _is_payment_error(self._error("Internal server error", 500)) is False
+
+    def test_no_status_code_with_billing_message(self):
+        msg = "billing: payment required for this request"
+        assert _is_payment_error(self._error(msg)) is True
+
+    def test_no_status_code_no_message(self):
+        """An unrelated message with no status code is not a payment error."""
+        assert _is_payment_error(self._error("connection reset")) is False
+
+
+class TestGetProviderChain:
+    """_get_provider_chain() looks its resolver functions up on every call."""
+
+    def test_returns_five_entries(self):
+        expected = ["openrouter", "nous", "local/custom", "openai-codex", "api-key"]
+        chain = _get_provider_chain()
+        assert len(chain) == 5
+        assert [entry[0] for entry in chain] == expected
+
+    def test_picks_up_patched_functions(self):
+        """Patches on _try_* functions must be visible in the chain."""
+        def sentinel():
+            return ("patched", "model")
+        with patch("agent.auxiliary_client._try_openrouter", sentinel):
+            assert _get_provider_chain()[0] == ("openrouter", sentinel)
+
+
+class TestTryPaymentFallback:
+    """_try_payment_fallback skips the failed provider and tries alternatives."""
+
+    def test_skips_failed_provider(self):
+        mock_client = MagicMock()
+        with patch("agent.auxiliary_client._try_openrouter", return_value=(None, None)) as mock_or, \
+             patch("agent.auxiliary_client._try_nous", return_value=(mock_client, "nous-model")), \
+             patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"):
+            client, model, label = _try_payment_fallback("openrouter", task="compression")
+        mock_or.assert_not_called()  # the depleted provider must not be retried
+        assert client is mock_client
+        assert model == "nous-model"
+        assert label == "nous"
+
+    def test_returns_none_when_no_fallback(self):
+        with patch("agent.auxiliary_client._try_openrouter", return_value=(None, None)), \
+             patch("agent.auxiliary_client._try_nous", return_value=(None, None)), \
+             patch("agent.auxiliary_client._try_custom_endpoint", return_value=(None, None)), \
+             patch("agent.auxiliary_client._try_codex", return_value=(None, None)), \
+             patch("agent.auxiliary_client._resolve_api_key_provider", return_value=(None, None)), \
+             patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"):
+            client, model, label = _try_payment_fallback("openrouter")
+        assert (client, model, label) == (None, None, "")
+
+    def test_codex_alias_maps_to_chain_label(self):
+        """'codex' should map to 'openai-codex' in the skip set."""
+        mock_client, mock_codex = MagicMock(), MagicMock()
+        chain = [("openai-codex", mock_codex), ("api-key", lambda: (mock_client, "key-model"))]
+        with patch("agent.auxiliary_client._get_provider_chain", return_value=chain), \
+             patch("agent.auxiliary_client._read_main_provider", return_value="openai-codex"):
+            client, model, label = _try_payment_fallback("codex", task="vision")
+        mock_codex.assert_not_called()  # alias resolved, so the Codex entry is skipped
+        assert client is mock_client and label == "api-key"
+
+    def test_skips_to_codex_when_or_and_nous_fail(self):
+        mock_codex = MagicMock()
+        with patch("agent.auxiliary_client._try_openrouter", return_value=(None, None)), \
+             patch("agent.auxiliary_client._try_nous", return_value=(None, None)), \
+             patch("agent.auxiliary_client._try_custom_endpoint", return_value=(None, None)), \
+             patch("agent.auxiliary_client._try_codex", return_value=(mock_codex, "gpt-5.2-codex")), \
+             patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"):
+            client, model, label = _try_payment_fallback("openrouter")
+        assert client is mock_codex
+        assert model == "gpt-5.2-codex"
+        assert label == "openai-codex"
+
+
+class TestCallLlmPaymentFallback:
+    """call_llm() moves to another provider on 402 / payment errors."""
+
+    def _make_402_error(self, msg="Payment Required: insufficient credits"):
+        err = Exception(msg)
+        err.status_code = 402
+        return err
+
+    def test_402_triggers_fallback(self, monkeypatch):
+        """When the primary provider returns 402, call_llm tries the next one."""
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        model_slug = "google/gemini-3-flash-preview"
+        prompt = [{"role": "user", "content": "hello"}]
+
+        depleted = MagicMock()
+        depleted.chat.completions.create.side_effect = self._make_402_error()
+
+        # The stand-in fallback client answers with a recognisable object.
+        backup = MagicMock()
+        reply = MagicMock()
+        backup.chat.completions.create.return_value = reply
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                    return_value=(depleted, model_slug)), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                    return_value=("openrouter", model_slug, None, None)), \
+             patch("agent.auxiliary_client._try_payment_fallback",
+                    return_value=(backup, "gpt-5.2-codex", "openai-codex")) as fallback_spy:
+            outcome = call_llm(task="compression", messages=prompt)
+
+        assert outcome is reply
+        fallback_spy.assert_called_once_with("openrouter", "compression")
+        # The retried request must carry the fallback provider's model.
+        sent = backup.chat.completions.create.call_args.kwargs
+        assert sent["model"] == "gpt-5.2-codex"
+
+    def test_non_payment_error_not_caught(self, monkeypatch):
+        """Non-payment errors (500, connection, etc.) should NOT trigger fallback."""
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        model_slug = "google/gemini-3-flash-preview"
+        prompt = [{"role": "user", "content": "hello"}]
+
+        # A plain HTTP 500 carries no billing keyword, so it must propagate as-is.
+        failure = Exception("Internal Server Error")
+        failure.status_code = 500
+        broken = MagicMock()
+        broken.chat.completions.create.side_effect = failure
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                    return_value=(broken, model_slug)), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                    return_value=("openrouter", model_slug, None, None)):
+            with pytest.raises(Exception, match="Internal Server Error"):
+                call_llm(task="compression", messages=prompt)
+
+    def test_402_with_no_fallback_reraises(self, monkeypatch):
+        """When 402 hits and no fallback is available, the original error propagates."""
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        model_slug = "google/gemini-3-flash-preview"
+        prompt = [{"role": "user", "content": "hello"}]
+
+        # The primary provider is out of credits and the fallback finds nothing.
+        depleted = MagicMock()
+        depleted.chat.completions.create.side_effect = self._make_402_error()
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                    return_value=(depleted, model_slug)), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                    return_value=("openrouter", model_slug, None, None)), \
+             patch("agent.auxiliary_client._try_payment_fallback",
+                    return_value=(None, None, "")):
+            with pytest.raises(Exception, match="insufficient credits"):
+                call_llm(task="compression", messages=prompt)