diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 4d2331548..13735a61c 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -2302,7 +2302,7 @@ def call_llm( resolved_provider, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model( task, provider, model, base_url, api_key) - if task == "vision": + if task in ("vision", "browser_vision"): effective_provider, client, final_model = resolve_vision_provider_client( provider=provider, model=model, diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 0ccf75881..a4305f379 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -363,6 +363,7 @@ compression: # # Can also be overridden per-session with BROWSER_VISION_MODEL env var. # browser_vision: # model: "google/gemma-4-27b-it" # default; override e.g. "google/gemini-2.5-flash" +# timeout: 120 # API call timeout in seconds (default 120s) # # # Web page scraping / summarization + browser page text extraction # web_extract: diff --git a/tests/tools/test_browser_vision_gemma4.py b/tests/tools/test_browser_vision_gemma4.py new file mode 100644 index 000000000..bc43a9b51 --- /dev/null +++ b/tests/tools/test_browser_vision_gemma4.py @@ -0,0 +1,82 @@ +"""Tests for task routing and timeout config — browser_vision Gemma 4 (Issue #816). + +Covers the additional wiring on top of the Gemma 4 default: + - browser_vision() uses task="browser_vision" so auxiliary.browser_vision.* + config is consulted for provider/model/timeout + - call_llm() routes "browser_vision" through vision provider resolution + (same path as "vision" task) + - Timeout is read from auxiliary.browser_vision.timeout before + auxiliary.vision.timeout + +Model selection tests are in test_browser_vision_model.py. +""" + +import inspect + + +# ── browser_vision() task routing ──────────────────────────────────────────── + +class TestBrowserVisionTaskRouting: + """browser_vision() must use task='browser_vision' in call_llm().""" + + def test_call_llm_receives_browser_vision_task(self): + """browser_vision() source uses task='browser_vision', not 'vision'.""" + src = inspect.getsource( + __import__("tools.browser_tool", fromlist=["browser_vision"]).browser_vision + ) + assert '"browser_vision"' in src or "'browser_vision'" in src, ( + "browser_vision() must pass task='browser_vision' to call_llm(), not 'vision'" + ) + + def test_call_llm_does_not_use_bare_vision_task(self): + """The call_llm() invocation must not use task='vision' for browser screenshots.""" + import re + src = inspect.getsource( + __import__("tools.browser_tool", fromlist=["browser_vision"]).browser_vision + ) + call_llm_blocks = re.findall(r'call_llm\s*\([^)]+\)', src, re.DOTALL) + for block in call_llm_blocks: + assert '"vision"' not in block and "'vision'" not in block, ( + f"call_llm() must use task='browser_vision', found 'vision' in: {block}" + ) + + +# ── call_llm() vision routing ──────────────────────────────────────────────── + +class TestCallLlmBrowserVisionRouting: + """call_llm(task='browser_vision') must route through vision provider path.""" + + def test_browser_vision_task_in_vision_branch(self): + """call_llm() source handles 'browser_vision' in the same branch as 'vision'.""" + from agent import auxiliary_client + src = inspect.getsource(auxiliary_client.call_llm) + assert 'task in ("vision", "browser_vision")' in src or \ + "task in ('vision', 'browser_vision')" in src, ( + "call_llm() should route 'browser_vision' through the vision provider path" + ) + + +# ── timeout resolution ──────────────────────────────────────────────────────── + +class TestBrowserVisionTimeoutResolution: + """browser_vision() reads auxiliary.browser_vision.timeout first.""" + + def test_browser_vision_timeout_checked_before_vision_timeout(self): + """Source checks auxiliary.browser_vision.timeout before auxiliary.vision.timeout.""" + src = inspect.getsource( + __import__("tools.browser_tool", fromlist=["browser_vision"]).browser_vision + ) + # Locate the timeout resolution block (before call_kwargs dict) + timeout_block_start = src.find("vision_timeout") + call_kwargs_start = src.find('"task": "browser_vision"') + assert timeout_block_start != -1, "Could not find vision_timeout in browser_vision source" + assert call_kwargs_start != -1, "Could not find task='browser_vision' in browser_vision source" + + # The timeout block should mention "browser_vision" before "vision" + block = src[timeout_block_start:call_kwargs_start] + bv_idx = block.find('"browser_vision"') + v_idx = block.find('"vision"') + if bv_idx != -1 and v_idx != -1: + assert bv_idx < v_idx, ( + "auxiliary.browser_vision.timeout should be checked before auxiliary.vision.timeout" + ) diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 283549877..1d2b3ac54 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -2033,21 +2033,25 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] logger.debug("browser_vision: analysing screenshot (%d bytes)", len(_screenshot_bytes)) - # Read vision timeout from config (auxiliary.vision.timeout), default 120s. - # Local vision models (llama.cpp, ollama) can take well over 30s for - # screenshot analysis, so the default must be generous. + # Read vision timeout from config (auxiliary.browser_vision.timeout, then + # auxiliary.vision.timeout), default 120s. Local vision models can take + # well over 30s for screenshot analysis, so the default must be generous. vision_timeout = 120.0 try: from hermes_cli.config import load_config _cfg = load_config() - _vt = _cfg.get("auxiliary", {}).get("vision", {}).get("timeout") + _aux = _cfg.get("auxiliary", {}) if isinstance(_cfg, dict) else {} + _vt = ( + (_aux.get("browser_vision") or {}).get("timeout") + or (_aux.get("vision") or {}).get("timeout") + ) if _vt is not None: vision_timeout = float(_vt) except Exception: pass call_kwargs = { - "task": "vision", + "task": "browser_vision", "messages": [ { "role": "user",