diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 657423679..b1d7a67a9 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -348,7 +348,7 @@ compression: # Other providers pick a sensible default automatically. # # auxiliary: -# # Image analysis: vision_analyze tool + browser screenshots +# # Image analysis: vision_analyze tool # vision: # provider: "auto" # model: "" # e.g. "google/gemini-2.5-flash", "openai/gpt-4o" @@ -356,6 +356,13 @@ compression: # download_timeout: 30 # Image HTTP download timeout (seconds) # # Increase for slow connections or self-hosted image servers # +# # Browser screenshot analysis (browser_vision tool) +# # Defaults to Gemma 4 27B — natively multimodal, same model family as the main +# # text model, which avoids model-switching overhead and improves context continuity. +# # Override with any vision-capable model. Set to "" to defer to the +# # AUXILIARY_VISION_MODEL env var, then the built-in default. +# browser_vision: +# model: "google/gemma-4-27b-it" # default; override e.g. 
"google/gemini-2.5-flash" +# # # Web page scraping / summarization + browser page text extraction # web_extract: # provider: "auto" diff --git a/tests/tools/test_browser_vision_model.py b/tests/tools/test_browser_vision_model.py new file mode 100644 index 000000000..e565da89e --- /dev/null +++ b/tests/tools/test_browser_vision_model.py @@ -0,0 +1,98 @@ +"""Tests for browser_tool._get_vision_model() — Gemma 4 default (Issue #816).""" + +import importlib +import sys +from unittest.mock import patch, MagicMock + +import pytest + + +def _reload_and_get(monkeypatch, env: dict, cfg: dict | None = None): + """Reload browser_tool with patched env vars and optional config, return _get_vision_model.""" + # Patch environment + for k, v in env.items(): + monkeypatch.setenv(k, v) + + # Patch load_config if a config dict is provided + if cfg is not None: + mock_load = MagicMock(return_value=cfg) + monkeypatch.setattr("hermes_cli.config.load_config", mock_load, raising=False) + + # We import the function directly rather than reloading the heavy module + # to keep tests fast. Import browser_tool once and call the function with + # patched globals each time. 
+ import tools.browser_tool as bt + return bt._get_vision_model + + +class TestGetVisionModelDefault: + def test_default_is_gemma4(self, monkeypatch): + monkeypatch.delenv("BROWSER_VISION_MODEL", raising=False) + monkeypatch.delenv("AUXILIARY_VISION_MODEL", raising=False) + import tools.browser_tool as bt + model = bt._get_vision_model() + assert model == "google/gemma-4-27b-it" + + def test_default_constant(self): + import tools.browser_tool as bt + assert bt._BROWSER_VISION_DEFAULT_MODEL == "google/gemma-4-27b-it" + + +class TestGetVisionModelEnvOverrides: + def test_browser_vision_model_env_takes_priority(self, monkeypatch): + monkeypatch.setenv("BROWSER_VISION_MODEL", "openai/gpt-4o") + monkeypatch.setenv("AUXILIARY_VISION_MODEL", "google/gemini-3-flash-preview") + import tools.browser_tool as bt + assert bt._get_vision_model() == "openai/gpt-4o" + + def test_auxiliary_vision_model_fallback(self, monkeypatch): + monkeypatch.delenv("BROWSER_VISION_MODEL", raising=False) + monkeypatch.setenv("AUXILIARY_VISION_MODEL", "google/gemini-3-flash-preview") + import tools.browser_tool as bt + assert bt._get_vision_model() == "google/gemini-3-flash-preview" + + def test_browser_vision_model_empty_falls_through(self, monkeypatch): + """Empty BROWSER_VISION_MODEL should fall through to next step.""" + monkeypatch.setenv("BROWSER_VISION_MODEL", "") + monkeypatch.delenv("AUXILIARY_VISION_MODEL", raising=False) + import tools.browser_tool as bt + # Should reach the default + assert bt._get_vision_model() == "google/gemma-4-27b-it" + + def test_auxiliary_vision_model_empty_falls_through(self, monkeypatch): + monkeypatch.delenv("BROWSER_VISION_MODEL", raising=False) + monkeypatch.setenv("AUXILIARY_VISION_MODEL", "") + import tools.browser_tool as bt + assert bt._get_vision_model() == "google/gemma-4-27b-it" + + +class TestGetVisionModelConfig: + def test_config_overrides_default(self, monkeypatch): + monkeypatch.delenv("BROWSER_VISION_MODEL", raising=False) + 
monkeypatch.delenv("AUXILIARY_VISION_MODEL", raising=False) + cfg = {"auxiliary": {"browser_vision": {"model": "anthropic/claude-3-5-haiku"}}} + with patch("hermes_cli.config.load_config", return_value=cfg): + import tools.browser_tool as bt + assert bt._get_vision_model() == "anthropic/claude-3-5-haiku" + + def test_config_empty_string_falls_through_to_default(self, monkeypatch): + monkeypatch.delenv("BROWSER_VISION_MODEL", raising=False) + monkeypatch.delenv("AUXILIARY_VISION_MODEL", raising=False) + cfg = {"auxiliary": {"browser_vision": {"model": ""}}} + with patch("hermes_cli.config.load_config", return_value=cfg): + import tools.browser_tool as bt + assert bt._get_vision_model() == "google/gemma-4-27b-it" + + def test_config_load_error_falls_through_to_default(self, monkeypatch): + monkeypatch.delenv("BROWSER_VISION_MODEL", raising=False) + monkeypatch.delenv("AUXILIARY_VISION_MODEL", raising=False) + with patch("hermes_cli.config.load_config", side_effect=Exception("config error")): + import tools.browser_tool as bt + assert bt._get_vision_model() == "google/gemma-4-27b-it" + + def test_env_beats_config(self, monkeypatch): + monkeypatch.setenv("BROWSER_VISION_MODEL", "openai/gpt-4o") + cfg = {"auxiliary": {"browser_vision": {"model": "anthropic/claude-3-5-haiku"}}} + with patch("hermes_cli.config.load_config", return_value=cfg): + import tools.browser_tool as bt + assert bt._get_vision_model() == "openai/gpt-4o" diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 03be84e02..9adc20c7e 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -200,9 +200,50 @@ def _get_command_timeout() -> int: return result +# Default vision model for browser screenshot analysis. +# Gemma 4 is natively multimodal so it can analyze screenshots using the same +# model already loaded for text tasks, reducing cold-start latency. 
+_BROWSER_VISION_DEFAULT_MODEL = "google/gemma-4-27b-it" + + def _get_vision_model() -> Optional[str]: - """Model for browser_vision (screenshot analysis — multimodal).""" - return os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None + """Model for browser_vision (screenshot analysis — multimodal). + + Resolution order (first non-empty value wins): + 1. ``BROWSER_VISION_MODEL`` env var — browser-specific override + 2. ``auxiliary.browser_vision.model`` in config.yaml + 3. ``AUXILIARY_VISION_MODEL`` env var — shared vision override + 4. ``_BROWSER_VISION_DEFAULT_MODEL`` — Gemma 4 27B (default) + + Empty values are treated as unset and fall through to the next step, so + this function always resolves to a concrete model (never ``None``). + """ + # 1. Browser-specific env var + env_browser = os.getenv("BROWSER_VISION_MODEL", "").strip() + if env_browser: + return env_browser + + # 2. Config file: auxiliary.browser_vision.model + try: + from hermes_cli.config import load_config + _cfg = load_config() + cfg_model = ( + _cfg.get("auxiliary", {}) + .get("browser_vision", {}) + .get("model", "") + ) + if cfg_model and str(cfg_model).strip(): + return str(cfg_model).strip() + except Exception: + pass + + # 3. Shared vision env var (backward-compat) + env_shared = os.getenv("AUXILIARY_VISION_MODEL", "").strip() + if env_shared: + return env_shared + + # 4. Default: Gemma 4 27B + return _BROWSER_VISION_DEFAULT_MODEL def _get_extraction_model() -> Optional[str]: @@ -1893,20 +1934,23 @@ def browser_get_images(task_id: Optional[str] = None) -> str: def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str: """ Take a screenshot of the current page and analyze it with vision AI. - - This tool captures what's visually displayed in the browser and sends it - to Gemini for analysis. 
Useful for understanding visual content that the - text-based snapshot may not capture (CAPTCHAs, verification challenges, - images, complex layouts, etc.). - + + Uses Gemma 4 27B by default (natively multimodal — same model family as the + main text model, lower cold-start latency than switching to a separate vision + model). Override via ``BROWSER_VISION_MODEL`` env var or + ``auxiliary.browser_vision.model`` in config.yaml. + + Useful for understanding visual content that the text-based snapshot may not + capture (CAPTCHAs, verification challenges, images, complex layouts, etc.). + The screenshot is saved persistently and its file path is returned alongside the analysis, so it can be shared with users via MEDIA: in the response. - + Args: question: What you want to know about the page visually annotate: If True, overlay numbered [N] labels on interactive elements task_id: Task identifier for session isolation - + Returns: JSON string with vision analysis results and screenshot_path """