From ef5d811abac69725208a90062f2da6ac502ef3ea Mon Sep 17 00:00:00 2001 From: teknium1 Date: Mon, 9 Mar 2026 15:36:19 -0700 Subject: [PATCH] fix: vision auto-detection now falls back to custom/local endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vision auto-mode previously only tried OpenRouter, Nous, and Codex for multimodal (vision) requests — deliberately skipping custom endpoints on the assumption that they 'may not handle vision input.' This caused silent failures for users running local multimodal models (Qwen-VL, LLaVA, Pixtral, etc.) without any cloud API keys. Now custom endpoints are tried as a last resort in auto mode. If the model doesn't support vision, the API call fails gracefully — but users with local vision models no longer need to manually set auxiliary.vision.provider: main in config.yaml. Reported by @Spadav and @kotyKD. --- agent/auxiliary_client.py | 10 +++++++--- tests/agent/test_auxiliary_client.py | 14 +++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index a32e3a293..57c3c1186 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -560,12 +560,16 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]: forced = _get_auxiliary_provider("vision") if forced != "auto": return _resolve_forced_provider(forced) - # Auto: only multimodal-capable providers - for try_fn in (_try_openrouter, _try_nous, _try_codex): + # Auto: try providers known to support multimodal first, then fall + # back to the user's custom endpoint. Many local models (Qwen-VL, + # LLaVA, Pixtral, etc.) support vision — skipping them entirely + # caused silent failures for local-only users. 
+ for try_fn in (_try_openrouter, _try_nous, _try_codex, + _try_custom_endpoint): client, model = try_fn() if client is not None: return client, model - logger.debug("Auxiliary vision client: none available (auto only tries OpenRouter/Nous/Codex)") + logger.debug("Auxiliary vision client: none available") return None, None diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 66187d055..299d083f2 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -176,14 +176,18 @@ class TestVisionClientFallback: assert isinstance(client, CodexAuxiliaryClient) assert model == "gpt-5.3-codex" - def test_vision_auto_skips_custom_endpoint(self, monkeypatch): - """Custom endpoint is skipped in vision auto mode.""" + def test_vision_auto_falls_back_to_custom_endpoint(self, monkeypatch): + """Custom endpoint is used as fallback in vision auto mode. + + Many local models (Qwen-VL, LLaVA, etc.) support vision. + When no OpenRouter/Nous/Codex is available, try the custom endpoint. + """ monkeypatch.setenv("OPENAI_BASE_URL", "http://localhost:1234/v1") monkeypatch.setenv("OPENAI_API_KEY", "local-key") - with patch("agent.auxiliary_client._read_nous_auth", return_value=None): + with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ + patch("agent.auxiliary_client.OpenAI") as mock_openai: client, model = get_vision_auxiliary_client() - assert client is None - assert model is None + assert client is not None # Custom endpoint picked up as fallback def test_vision_uses_openrouter_when_available(self, monkeypatch): monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")