diff --git a/cli-config.yaml.example b/cli-config.yaml.example index b1d7a67a9..0ccf75881 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -360,6 +360,7 @@ compression: # # Defaults to Gemma 4 27B — natively multimodal, same model family as the main # # text model, which avoids model-switching overhead and improves context continuity. # # Override with any vision-capable model. Set to "" to fall back to auto-detection. +# # Can also be overridden per-session with BROWSER_VISION_MODEL env var. # browser_vision: # model: "google/gemma-4-27b-it" # default; override e.g. "google/gemini-2.5-flash" # diff --git a/tests/tools/test_browser_vision_model.py b/tests/tools/test_browser_vision_model.py index e565da89e..592de3a32 100644 --- a/tests/tools/test_browser_vision_model.py +++ b/tests/tools/test_browser_vision_model.py @@ -1,4 +1,12 @@ -"""Tests for browser_tool._get_vision_model() — Gemma 4 default (Issue #816).""" +"""Tests for browser_tool._get_vision_model() — Gemma 4 default (Issue #816). + +Covers acceptance criteria from issue #816: +- Browser screenshots use Gemma 4 by default. +- BROWSER_VISION_MODEL env var overrides the model for browser vision only. +- AUXILIARY_VISION_MODEL env var still works as a global override. +- auxiliary.browser_vision.model in config.yaml overrides the default. +- Priority: BROWSER_VISION_MODEL > config.yaml > AUXILIARY_VISION_MODEL > default. 
+""" import importlib import sys @@ -96,3 +104,26 @@ class TestGetVisionModelConfig: with patch("hermes_cli.config.load_config", return_value=cfg): import tools.browser_tool as bt assert bt._get_vision_model() == "openai/gpt-4o" + + def test_config_beats_auxiliary_vision_model(self, monkeypatch): + """Config should override AUXILIARY_VISION_MODEL when BROWSER_VISION_MODEL unset.""" + monkeypatch.delenv("BROWSER_VISION_MODEL", raising=False) + monkeypatch.setenv("AUXILIARY_VISION_MODEL", "global-override") + cfg = {"auxiliary": {"browser_vision": {"model": "config-model"}}} + with patch("hermes_cli.config.load_config", return_value=cfg): + import tools.browser_tool as bt + assert bt._get_vision_model() == "config-model" + + +class TestBackwardCompatibility: + """AUXILIARY_VISION_MODEL must still work for users who already have it configured.""" + + def test_existing_auxiliary_vision_model_not_broken(self, monkeypatch): + """Users who set AUXILIARY_VISION_MODEL must not be broken by this change.""" + monkeypatch.delenv("BROWSER_VISION_MODEL", raising=False) + monkeypatch.setenv("AUXILIARY_VISION_MODEL", "openai/gpt-4o") + import tools.browser_tool as bt + with patch("hermes_cli.config.load_config", return_value={}): + model = bt._get_vision_model() + assert model == "openai/gpt-4o" + assert model != "google/gemma-4-27b-it" diff --git a/tools/browser_tool.py b/tools/browser_tool.py index e2150c078..283549877 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -806,7 +806,7 @@ BROWSER_TOOL_SCHEMAS = [ }, { "name": "browser_vision", - "description": "Take a screenshot of the current page and analyze it with vision AI. Use this when you need to visually understand what's on the page - especially useful for CAPTCHAs, visual verification challenges, complex layouts, or when the text snapshot doesn't capture important visual information. Returns both the AI analysis and a screenshot_path that you can share with the user by including MEDIA:<screenshot_path> in your response. 
Requires browser_navigate to be called first.", + "description": "Take a screenshot of the current page and analyze it with vision AI (default: Gemma 4 multimodal). Use this when you need to visually understand what's on the page - especially useful for CAPTCHAs, visual verification challenges, complex layouts, or when the text snapshot doesn't capture important visual information. Returns both the AI analysis and a screenshot_path that you can share with the user by including MEDIA:<screenshot_path> in your response. Requires browser_navigate to be called first. Vision model can be overridden via BROWSER_VISION_MODEL env var or auxiliary.browser_vision.model in config.yaml.", "parameters": { "type": "object", "properties": {