def _get_vision_model() -> Optional[str]:
    """Return the model name for browser_vision (screenshot analysis — multimodal).

    The name is taken from the first of these environment variables that
    holds a non-blank value (surrounding whitespace is stripped):

    1. ``AUXILIARY_VISION_MODEL`` — explicit override, always wins.
    2. ``GEMMA_VISION_MODEL`` — meant for a Gemma 4 model (native
       multimodal, so no separate vision model is needed).
    3. ``OLLAMA_VISION_MODEL`` — meant for a local Ollama vision model.

    Nothing is probed or auto-detected here: a Gemma or Ollama model is
    only selected when its variable is set explicitly.

    Returns:
        The stripped model name, or ``None`` when every variable is unset
        or blank. NOTE(review): ``None`` is intended to make the caller
        fall back to a text-only snapshot — confirm against the caller.
    """
    # Order matters: earlier variables take precedence over later ones.
    for env_var in (
        "AUXILIARY_VISION_MODEL",
        "GEMMA_VISION_MODEL",
        "OLLAMA_VISION_MODEL",
    ):
        model = os.getenv(env_var, "").strip()
        if model:
            return model
    return None