fix: auxiliary client payment fallback — retry with next provider on 402 (#5599)
When a user runs out of OpenRouter credits and switches to Codex (or any other provider), auxiliary tasks (compression, vision, web_extract) would still try OpenRouter first and fail with 402. Two fixes: 1. Payment fallback in call_llm(): When a resolved provider returns HTTP 402 or a credit-related error, automatically retry with the next available provider in the auto-detection chain. Skips the depleted provider and tries Nous → Custom → Codex → API-key providers. 2. Remove hardcoded OpenRouter fallback: The old code fell back specifically to OpenRouter when auto/custom resolution returned no client. Now falls back to the full auto-detection chain, which handles any available provider — not just OpenRouter. Also extracts _get_provider_chain() as a shared function (replaces inline tuple in _resolve_auto and the new fallback), built at call time so test patches on _try_* functions remain visible. Adds 16 tests covering _is_payment_error(), _get_provider_chain(), _try_payment_fallback(), and call_llm() integration with 402 retry.
This commit is contained in:
@@ -34,6 +34,12 @@ than the provider's default.
|
||||
Per-task direct endpoint overrides (e.g. AUXILIARY_VISION_BASE_URL,
|
||||
AUXILIARY_VISION_API_KEY) let callers route a specific auxiliary task to a
|
||||
custom OpenAI-compatible endpoint without touching the main model settings.
|
||||
|
||||
Payment / credit exhaustion fallback:
|
||||
When a resolved provider returns HTTP 402 or a credit-related error,
|
||||
call_llm() automatically retries with the next available provider in the
|
||||
auto-detection chain. This handles the common case where a user depletes
|
||||
their OpenRouter balance but has Codex OAuth or another provider available.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -874,10 +880,90 @@ _AUTO_PROVIDER_LABELS = {
|
||||
"_resolve_api_key_provider": "api-key",
|
||||
}
|
||||
|
||||
|
||||
_AGGREGATOR_PROVIDERS = frozenset({"openrouter", "nous"})
|
||||
|
||||
|
||||
def _get_provider_chain() -> List[tuple]:
    """Return the ordered provider detection chain.

    The chain is assembled on every call rather than cached at module
    level, so patches applied to the ``_try_*`` functions in tests are
    reflected in the returned list.
    """
    # (label, resolver) pairs, in detection-priority order.
    providers = (
        ("openrouter", _try_openrouter),
        ("nous", _try_nous),
        ("local/custom", _try_custom_endpoint),
        ("openai-codex", _try_codex),
        ("api-key", _resolve_api_key_provider),
    )
    return list(providers)
|
||||
|
||||
|
||||
def _is_payment_error(exc: Exception) -> bool:
|
||||
"""Detect payment/credit/quota exhaustion errors.
|
||||
|
||||
Returns True for HTTP 402 (Payment Required) and for 429/other errors
|
||||
whose message indicates billing exhaustion rather than rate limiting.
|
||||
"""
|
||||
status = getattr(exc, "status_code", None)
|
||||
if status == 402:
|
||||
return True
|
||||
err_lower = str(exc).lower()
|
||||
# OpenRouter and other providers include "credits" or "afford" in 402 bodies,
|
||||
# but sometimes wrap them in 429 or other codes.
|
||||
if status in (402, 429, None):
|
||||
if any(kw in err_lower for kw in ("credits", "insufficient funds",
|
||||
"can only afford", "billing",
|
||||
"payment required")):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _try_payment_fallback(
    failed_provider: str,
    task: Optional[str] = None,
) -> Tuple[Optional[Any], Optional[str], str]:
    """Try alternative providers after a payment/credit error.

    Iterates the standard auto-detection chain, skipping the provider that
    returned a payment error.

    Args:
        failed_provider: Label of the provider that raised the payment
            error (e.g. "openrouter", "codex"); it is excluded from the
            fallback search.
        task: Auxiliary task name, used only to contextualise log lines.

    Returns:
        (client, model, provider_label) or (None, None, "") if no fallback.
    """
    # Normalise the failed provider label for matching.
    skip = failed_provider.lower().strip()
    # Also skip Step-1 main-provider path if it maps to the same backend.
    # (e.g. main_provider="openrouter" → skip "openrouter" in chain)
    main_provider = _read_main_provider()
    skip_labels = {skip}
    # NOTE(review): `in` here is a substring test (e.g. "codex" in
    # "openai-codex"), not equality — presumably intentional fuzzy
    # matching of the main provider against the failed label; confirm.
    if main_provider and main_provider.lower() in skip:
        skip_labels.add(main_provider.lower())
    # Map common resolved_provider values back to chain labels.
    _alias_to_label = {"openrouter": "openrouter", "nous": "nous",
                       "openai-codex": "openai-codex", "codex": "openai-codex",
                       "custom": "local/custom", "local/custom": "local/custom"}
    skip_chain_labels = {_alias_to_label.get(s, s) for s in skip_labels}

    tried = []  # providers probed but unavailable, reported in the warning
    for label, try_fn in _get_provider_chain():
        if label in skip_chain_labels:
            continue
        client, model = try_fn()
        if client is not None:
            # First available provider wins.
            logger.info(
                "Auxiliary %s: payment error on %s — falling back to %s (%s)",
                task or "call", failed_provider, label, model or "default",
            )
            return client, model, label
        tried.append(label)

    # Chain exhausted without finding a usable client.
    logger.warning(
        "Auxiliary %s: payment error on %s and no fallback available (tried: %s)",
        task or "call", failed_provider, ", ".join(tried),
    )
    return None, None, ""
|
||||
|
||||
|
||||
def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||
"""Full auto-detection chain.
|
||||
|
||||
@@ -905,10 +991,7 @@ def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||
|
||||
# ── Step 2: aggregator / fallback chain ──────────────────────────────
|
||||
tried = []
|
||||
for try_fn in (_try_openrouter, _try_nous, _try_custom_endpoint,
|
||||
_try_codex, _resolve_api_key_provider):
|
||||
fn_name = getattr(try_fn, "__name__", "unknown")
|
||||
label = _AUTO_PROVIDER_LABELS.get(fn_name, fn_name)
|
||||
for label, try_fn in _get_provider_chain():
|
||||
client, model = try_fn()
|
||||
if client is not None:
|
||||
if tried:
|
||||
@@ -1786,12 +1869,15 @@ def call_llm(
|
||||
f"was found. Set the {_explicit.upper()}_API_KEY environment "
|
||||
f"variable, or switch to a different provider with `hermes model`."
|
||||
)
|
||||
# For auto/custom, fall back to OpenRouter
|
||||
# For auto/custom with no credentials, try the full auto chain
|
||||
# rather than hardcoding OpenRouter (which may be depleted).
|
||||
# Pass model=None so each provider uses its own default —
|
||||
# resolved_model may be an OpenRouter-format slug that doesn't
|
||||
# work on other providers.
|
||||
if not resolved_base_url:
|
||||
logger.info("Auxiliary %s: provider %s unavailable, falling back to openrouter",
|
||||
logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
|
||||
task or "call", resolved_provider)
|
||||
client, final_model = _get_cached_client(
|
||||
"openrouter", resolved_model or _OPENROUTER_MODEL)
|
||||
client, final_model = _get_cached_client("auto")
|
||||
if client is None:
|
||||
raise RuntimeError(
|
||||
f"No LLM provider configured for task={task} provider={resolved_provider}. "
|
||||
@@ -1812,7 +1898,7 @@ def call_llm(
|
||||
tools=tools, timeout=effective_timeout, extra_body=extra_body,
|
||||
base_url=resolved_base_url)
|
||||
|
||||
# Handle max_tokens vs max_completion_tokens retry
|
||||
# Handle max_tokens vs max_completion_tokens retry, then payment fallback.
|
||||
try:
|
||||
return client.chat.completions.create(**kwargs)
|
||||
except Exception as first_err:
|
||||
@@ -1820,7 +1906,30 @@ def call_llm(
|
||||
if "max_tokens" in err_str or "unsupported_parameter" in err_str:
|
||||
kwargs.pop("max_tokens", None)
|
||||
kwargs["max_completion_tokens"] = max_tokens
|
||||
return client.chat.completions.create(**kwargs)
|
||||
try:
|
||||
return client.chat.completions.create(**kwargs)
|
||||
except Exception as retry_err:
|
||||
# If the max_tokens retry also hits a payment error,
|
||||
# fall through to the payment fallback below.
|
||||
if not _is_payment_error(retry_err):
|
||||
raise
|
||||
first_err = retry_err
|
||||
|
||||
# ── Payment / credit exhaustion fallback ──────────────────────
|
||||
# When the resolved provider returns 402 or a credit-related error,
|
||||
# try alternative providers instead of giving up. This handles the
|
||||
# common case where a user runs out of OpenRouter credits but has
|
||||
# Codex OAuth or another provider available.
|
||||
if _is_payment_error(first_err):
|
||||
fb_client, fb_model, fb_label = _try_payment_fallback(
|
||||
resolved_provider, task)
|
||||
if fb_client is not None:
|
||||
fb_kwargs = _build_call_kwargs(
|
||||
fb_label, fb_model, messages,
|
||||
temperature=temperature, max_tokens=max_tokens,
|
||||
tools=tools, timeout=effective_timeout,
|
||||
extra_body=extra_body)
|
||||
return fb_client.chat.completions.create(**fb_kwargs)
|
||||
raise
|
||||
|
||||
|
||||
|
||||
@@ -14,8 +14,12 @@ from agent.auxiliary_client import (
|
||||
resolve_vision_provider_client,
|
||||
resolve_provider_client,
|
||||
auxiliary_max_tokens_param,
|
||||
call_llm,
|
||||
_read_codex_access_token,
|
||||
_get_auxiliary_provider,
|
||||
_get_provider_chain,
|
||||
_is_payment_error,
|
||||
_try_payment_fallback,
|
||||
_resolve_forced_provider,
|
||||
_resolve_auto,
|
||||
)
|
||||
@@ -1106,3 +1110,183 @@ class TestAuxiliaryMaxTokensParam:
|
||||
patch("agent.auxiliary_client._read_codex_access_token", return_value=None):
|
||||
result = auxiliary_max_tokens_param(1024)
|
||||
assert result == {"max_tokens": 1024}
|
||||
|
||||
|
||||
# ── Payment / credit exhaustion fallback ─────────────────────────────────
|
||||
|
||||
|
||||
class TestIsPaymentError:
    """_is_payment_error detects 402 and credit-related errors."""

    @staticmethod
    def _make_error(message, status=None):
        # Build an exception carrying an optional HTTP status code.
        exc = Exception(message)
        if status is not None:
            exc.status_code = status
        return exc

    def test_402_status_code(self):
        exc = self._make_error("Payment Required", status=402)
        assert _is_payment_error(exc) is True

    def test_402_with_credits_message(self):
        exc = self._make_error(
            "You requested up to 65535 tokens, but can only afford 8029",
            status=402)
        assert _is_payment_error(exc) is True

    def test_429_with_credits_message(self):
        exc = self._make_error("insufficient credits remaining", status=429)
        assert _is_payment_error(exc) is True

    def test_429_without_credits_message_is_not_payment(self):
        """Normal rate limits should NOT be treated as payment errors."""
        exc = self._make_error("Rate limit exceeded, try again in 2 seconds",
                               status=429)
        assert _is_payment_error(exc) is False

    def test_generic_500_is_not_payment(self):
        exc = self._make_error("Internal server error", status=500)
        assert _is_payment_error(exc) is False

    def test_no_status_code_with_billing_message(self):
        exc = self._make_error("billing: payment required for this request")
        assert _is_payment_error(exc) is True

    def test_no_status_code_no_message(self):
        exc = self._make_error("connection reset")
        assert _is_payment_error(exc) is False
|
||||
|
||||
|
||||
class TestGetProviderChain:
    """_get_provider_chain() resolves functions at call time (testable)."""

    def test_returns_five_entries(self):
        chain = _get_provider_chain()
        assert len(chain) == 5
        expected = ["openrouter", "nous", "local/custom",
                    "openai-codex", "api-key"]
        assert [name for name, _fn in chain] == expected

    def test_picks_up_patched_functions(self):
        """Patches on _try_* functions must be visible in the chain."""
        def fake_openrouter():
            return ("patched", "model")

        with patch("agent.auxiliary_client._try_openrouter", fake_openrouter):
            assert _get_provider_chain()[0] == ("openrouter", fake_openrouter)
|
||||
|
||||
|
||||
class TestTryPaymentFallback:
    """_try_payment_fallback skips the failed provider and tries alternatives."""

    def test_skips_failed_provider(self):
        nous_client = MagicMock()
        # OpenRouter is both the failed provider and the main provider,
        # so the chain should land on Nous.
        with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \
             patch("agent.auxiliary_client._try_openrouter", return_value=(None, None)), \
             patch("agent.auxiliary_client._try_nous", return_value=(nous_client, "nous-model")):
            client, model, label = _try_payment_fallback("openrouter", task="compression")
        assert client is nous_client
        assert model == "nous-model"
        assert label == "nous"

    def test_returns_none_when_no_fallback(self):
        # Every provider in the chain reports unavailable.
        unavailable = (None, None)
        with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \
             patch("agent.auxiliary_client._try_openrouter", return_value=unavailable), \
             patch("agent.auxiliary_client._try_nous", return_value=unavailable), \
             patch("agent.auxiliary_client._try_custom_endpoint", return_value=unavailable), \
             patch("agent.auxiliary_client._try_codex", return_value=unavailable), \
             patch("agent.auxiliary_client._resolve_api_key_provider", return_value=unavailable):
            client, model, label = _try_payment_fallback("openrouter")
        assert client is None
        assert label == ""

    def test_codex_alias_maps_to_chain_label(self):
        """'codex' should map to 'openai-codex' in the skip set."""
        or_client = MagicMock()
        with patch("agent.auxiliary_client._read_main_provider", return_value="openai-codex"), \
             patch("agent.auxiliary_client._try_codex", return_value=(None, None)), \
             patch("agent.auxiliary_client._try_openrouter", return_value=(or_client, "or-model")):
            client, model, label = _try_payment_fallback("openai-codex", task="vision")
        assert client is or_client
        assert label == "openrouter"

    def test_skips_to_codex_when_or_and_nous_fail(self):
        codex_client = MagicMock()
        with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \
             patch("agent.auxiliary_client._try_openrouter", return_value=(None, None)), \
             patch("agent.auxiliary_client._try_nous", return_value=(None, None)), \
             patch("agent.auxiliary_client._try_custom_endpoint", return_value=(None, None)), \
             patch("agent.auxiliary_client._try_codex", return_value=(codex_client, "gpt-5.2-codex")):
            client, model, label = _try_payment_fallback("openrouter")
        assert client is codex_client
        assert model == "gpt-5.2-codex"
        assert label == "openai-codex"
|
||||
|
||||
|
||||
class TestCallLlmPaymentFallback:
    """call_llm() retries with a different provider on 402 / payment errors."""

    def _make_402_error(self, msg="Payment Required: insufficient credits"):
        # Helper: an exception shaped like a provider 402 response.
        exc = Exception(msg)
        exc.status_code = 402
        return exc

    def test_402_triggers_fallback(self, monkeypatch):
        """When the primary provider returns 402, call_llm tries the next one."""
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")

        # Primary provider always fails with a payment error.
        primary_client = MagicMock()
        primary_client.chat.completions.create.side_effect = self._make_402_error()

        # Fallback provider succeeds with a canned response.
        fallback_client = MagicMock()
        fallback_response = MagicMock()
        fallback_client.chat.completions.create.return_value = fallback_response

        with patch("agent.auxiliary_client._get_cached_client",
                   return_value=(primary_client, "google/gemini-3-flash-preview")), \
             patch("agent.auxiliary_client._resolve_task_provider_model",
                   return_value=("openrouter", "google/gemini-3-flash-preview", None, None)), \
             patch("agent.auxiliary_client._try_payment_fallback",
                   return_value=(fallback_client, "gpt-5.2-codex", "openai-codex")) as mock_fb:
            result = call_llm(
                task="compression",
                messages=[{"role": "user", "content": "hello"}],
            )

        assert result is fallback_response
        # The fallback must be asked once, with the failed provider + task.
        mock_fb.assert_called_once_with("openrouter", "compression")
        # Fallback call should use the fallback model
        fb_kwargs = fallback_client.chat.completions.create.call_args.kwargs
        assert fb_kwargs["model"] == "gpt-5.2-codex"

    def test_non_payment_error_not_caught(self, monkeypatch):
        """Non-payment errors (500, connection, etc.) should NOT trigger fallback."""
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")

        primary_client = MagicMock()
        server_err = Exception("Internal Server Error")
        server_err.status_code = 500
        primary_client.chat.completions.create.side_effect = server_err

        with patch("agent.auxiliary_client._get_cached_client",
                   return_value=(primary_client, "google/gemini-3-flash-preview")), \
             patch("agent.auxiliary_client._resolve_task_provider_model",
                   return_value=("openrouter", "google/gemini-3-flash-preview", None, None)):
            # The 500 must propagate unchanged — no fallback attempted.
            with pytest.raises(Exception, match="Internal Server Error"):
                call_llm(
                    task="compression",
                    messages=[{"role": "user", "content": "hello"}],
                )

    def test_402_with_no_fallback_reraises(self, monkeypatch):
        """When 402 hits and no fallback is available, the original error propagates."""
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")

        primary_client = MagicMock()
        primary_client.chat.completions.create.side_effect = self._make_402_error()

        with patch("agent.auxiliary_client._get_cached_client",
                   return_value=(primary_client, "google/gemini-3-flash-preview")), \
             patch("agent.auxiliary_client._resolve_task_provider_model",
                   return_value=("openrouter", "google/gemini-3-flash-preview", None, None)), \
             patch("agent.auxiliary_client._try_payment_fallback",
                   return_value=(None, None, "")):
            # Fallback found nothing, so the 402 itself is re-raised.
            with pytest.raises(Exception, match="insufficient credits"):
                call_llm(
                    task="compression",
                    messages=[{"role": "user", "content": "hello"}],
                )
|
||||
|
||||
Reference in New Issue
Block a user