feat: wire Gemma 4 vision into browser_tool for screenshots (#816)

Resolves #816. Updated _get_vision_model() to prefer Gemma 4 native multimodal for screenshot analysis.

Priority chain:
1. AUXILIARY_VISION_MODEL (explicit override)
2. GEMMA_VISION_MODEL (Gemma 4 native multimodal)
3. OLLAMA_VISION_MODEL (local Ollama vision)
4. None (text-only fallback)

Reduces latency by eliminating model switching for vision tasks. Backward compatible — existing AUXILIARY_VISION_MODEL still works.
2026-04-16 01:46:35 -04:00
parent 5022db9d7b
commit dc0a3d2024
1 changed file with 25 additions and 2 deletions
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@@ -201,8 +201,31 @@ def _get_command_timeout() -> int:


 def _get_vision_model() -> Optional[str]:
-    """Model for browser_vision (screenshot analysis — multimodal)."""
-    return os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
+    """Return the model for browser_vision (screenshot analysis — multimodal).
+
+    Priority (the first env var with a non-blank value wins):
+    1. AUXILIARY_VISION_MODEL (explicit override)
+    2. GEMMA_VISION_MODEL (intended for Gemma 4 native multimodal; value is not validated)
+    3. OLLAMA_VISION_MODEL (intended for a local Ollama vision model)
+    4. None if all are unset or blank (callers presumably fall back to the text-only snapshot)
+    """
+    # Explicit override always wins; surrounding whitespace is stripped first
+    explicit = os.getenv("AUXILIARY_VISION_MODEL", "").strip()
+    if explicit:
+        return explicit
+
+    # Next: GEMMA_VISION_MODEL (meant for Gemma 4 native multimodal — no separate vision model)
+    gemma = os.getenv("GEMMA_VISION_MODEL", "").strip()
+    if gemma:
+        return gemma
+
+    # Then: OLLAMA_VISION_MODEL — only the env var is read; Ollama itself is not queried
+    ollama_vision = os.getenv("OLLAMA_VISION_MODEL", "").strip()
+    if ollama_vision:
+        return ollama_vision
+
+    # Nothing configured: return None (text-only fallback)
+    return None


 def _get_extraction_model() -> Optional[str]: