fix(vision): auto-resize oversized images, increase default timeout, fix vision capability detection

Cherry-picked from PR #7749 by kshitijk4poor with modifications: - Raise hard image limit from 5 MB to 20 MB (matches most restrictive provider) - Send images at full resolution first; only auto-resize to 5 MB on API failure - Add _is_image_size_error() helper to detect size-related API rejections - Auto-resize uses Pillow (soft dep) with progressive downscale + JPEG quality reduction - Fix get_model_capabilities() to check modalities.input for vision support - Increase default vision timeout from 30s to 120s (matches hardcoded fallback intent) - Applied retry-with-resize to both vision_analyze_tool and browser_vision Closes #7740
2026-04-11 11:07:18 -07:00
parent 06e1d9cdd4
commit 50bb4fe010
6 changed files with 399 additions and 25 deletions
--- a/agent/models_dev.py
+++ b/agent/models_dev.py
@@ -383,7 +383,14 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit

    # Extract capability flags (default to False if missing)
    supports_tools = bool(entry.get("tool_call", False))
-    supports_vision = bool(entry.get("attachment", False))
+    # Vision: check both the `attachment` flag and `modalities.input` for "image".
+    # Some models (e.g. gemma-4) list image in input modalities but not attachment.
+    input_mods = entry.get("modalities", {})
+    if isinstance(input_mods, dict):
+        input_mods = input_mods.get("input", [])
+    else:
+        input_mods = []
+    supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
    supports_reasoning = bool(entry.get("reasoning", False))

    # Extract limits
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -381,7 +381,7 @@ DEFAULT_CONFIG = {
            "model": "",           # e.g. "google/gemini-2.5-flash", "gpt-4o"
            "base_url": "",        # direct OpenAI-compatible endpoint (takes precedence over provider)
            "api_key": "",         # API key for base_url (falls back to OPENAI_API_KEY)
-            "timeout": 30,         # seconds — LLM API call timeout; increase for slow local vision models
+            "timeout": 120,        # seconds — LLM API call timeout; vision payloads need generous timeout
            "download_timeout": 30,  # seconds — image HTTP download timeout; increase for slow connections
        },
        "web_extract": {
--- a/tests/agent/test_models_dev.py
+++ b/tests/agent/test_models_dev.py
@@ -7,6 +7,7 @@ from agent.models_dev import (
    PROVIDER_TO_MODELS_DEV,
    _extract_context,
    fetch_models_dev,
+    get_model_capabilities,
    lookup_models_dev_context,
 )

@@ -195,3 +196,88 @@ class TestFetchModelsDev:
        result = fetch_models_dev()
        mock_get.assert_not_called()
        assert result == SAMPLE_REGISTRY
+
+
+# ---------------------------------------------------------------------------
+# get_model_capabilities — vision via modalities.input
+# ---------------------------------------------------------------------------
+
+
+CAPS_REGISTRY = {
+    "google": {
+        "id": "google",
+        "models": {
+            "gemma-4-31b-it": {
+                "id": "gemma-4-31b-it",
+                "attachment": False,
+                "tool_call": True,
+                "modalities": {"input": ["text", "image"]},
+                "limit": {"context": 128000, "output": 8192},
+            },
+            "gemma-3-1b": {
+                "id": "gemma-3-1b",
+                "tool_call": True,
+                "limit": {"context": 32000, "output": 8192},
+            },
+        },
+    },
+    "anthropic": {
+        "id": "anthropic",
+        "models": {
+            "claude-sonnet-4": {
+                "id": "claude-sonnet-4",
+                "attachment": True,
+                "tool_call": True,
+                "limit": {"context": 200000, "output": 64000},
+            },
+        },
+    },
+}
+
+
+class TestGetModelCapabilities:
+    """Tests for get_model_capabilities vision detection."""
+
+    def test_vision_from_attachment_flag(self):
+        """Models with attachment=True should report supports_vision=True."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("anthropic", "claude-sonnet-4")
+        assert caps is not None
+        assert caps.supports_vision is True
+
+    def test_vision_from_modalities_input_image(self):
+        """Models with 'image' in modalities.input but attachment=False should
+        still report supports_vision=True (the core fix in this PR)."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("google", "gemma-4-31b-it")
+        assert caps is not None
+        assert caps.supports_vision is True
+
+    def test_no_vision_without_attachment_or_modalities(self):
+        """Models with neither attachment nor image modality should be non-vision."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("google", "gemma-3-1b")
+        assert caps is not None
+        assert caps.supports_vision is False
+
+    def test_modalities_non_dict_handled(self):
+        """Non-dict modalities field should not crash."""
+        registry = {
+            "google": {"id": "google", "models": {
+                "weird-model": {
+                    "id": "weird-model",
+                    "modalities": "text",  # not a dict
+                    "limit": {"context": 200000, "output": 8192},
+                },
+            }},
+        }
+        with patch("agent.models_dev.fetch_models_dev", return_value=registry):
+            caps = get_model_capabilities("gemini", "weird-model")
+        assert caps is not None
+        assert caps.supports_vision is False
+
+    def test_model_not_found_returns_none(self):
+        """Unknown model should return None."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("anthropic", "nonexistent-model")
+        assert caps is None
--- a/tests/tools/test_vision_tools.py
+++ b/tests/tools/test_vision_tools.py
@@ -15,6 +15,10 @@ from tools.vision_tools import (
    _handle_vision_analyze,
    _determine_mime_type,
    _image_to_base64_data_url,
+    _resize_image_for_vision,
+    _is_image_size_error,
+    _MAX_BASE64_BYTES,
+    _RESIZE_TARGET_BYTES,
    vision_analyze_tool,
    check_vision_requirements,
    get_debug_session_info,
@@ -590,11 +594,13 @@ class TestBase64SizeLimit:

    @pytest.mark.asyncio
    async def test_oversized_image_rejected_before_api_call(self, tmp_path):
-        """Images exceeding 5 MB base64 should fail with a clear size error."""
+        """Images exceeding the 20 MB hard limit should fail with a clear error."""
        img = tmp_path / "huge.png"
        img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (4 * 1024 * 1024))

-        with patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
+        # Patch the hard limit to a small value so the test runs fast.
+        with patch("tools.vision_tools._MAX_BASE64_BYTES", 1000), \
+             patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
            result = json.loads(await vision_analyze_tool(str(img), "describe this"))

        assert result["success"] is False
@@ -686,3 +692,124 @@ class TestVisionRegistration:

        entry = registry._tools.get("vision_analyze")
        assert callable(entry.handler)
+
+
+# ---------------------------------------------------------------------------
+# _resize_image_for_vision — auto-resize oversized images
+# ---------------------------------------------------------------------------
+
+
+class TestResizeImageForVision:
+    """Tests for the auto-resize function."""
+
+    def test_small_image_returned_as_is(self, tmp_path):
+        """Images under the limit should be returned unchanged."""
+        # Create a small 10x10 red PNG
+        try:
+            from PIL import Image
+        except ImportError:
+            pytest.skip("Pillow not installed")
+        img = Image.new("RGB", (10, 10), (255, 0, 0))
+        path = tmp_path / "small.png"
+        img.save(path, "PNG")
+
+        result = _resize_image_for_vision(path, mime_type="image/png")
+        assert result.startswith("data:image/png;base64,")
+        assert len(result) < _MAX_BASE64_BYTES
+
+    def test_large_image_is_resized(self, tmp_path):
+        """Images over the default target should be auto-resized to fit."""
+        try:
+            from PIL import Image
+        except ImportError:
+            pytest.skip("Pillow not installed")
+        # Create a large image that will exceed 5 MB in base64
+        # A 4000x4000 uncompressed PNG will be large
+        img = Image.new("RGB", (4000, 4000), (128, 200, 50))
+        path = tmp_path / "large.png"
+        img.save(path, "PNG")
+
+        result = _resize_image_for_vision(path, mime_type="image/png")
+        assert result.startswith("data:image/png;base64,")
+        # Default target is _RESIZE_TARGET_BYTES (5 MB), not _MAX_BASE64_BYTES (20 MB)
+        assert len(result) <= _RESIZE_TARGET_BYTES
+
+    def test_custom_max_bytes(self, tmp_path):
+        """The max_base64_bytes parameter should be respected."""
+        try:
+            from PIL import Image
+        except ImportError:
+            pytest.skip("Pillow not installed")
+        img = Image.new("RGB", (200, 200), (0, 128, 255))
+        path = tmp_path / "medium.png"
+        img.save(path, "PNG")
+
+        # Set a very low limit to force resizing
+        result = _resize_image_for_vision(path, max_base64_bytes=500)
+        # Should still return a valid data URL
+        assert result.startswith("data:image/")
+
+    def test_jpeg_output_for_non_png(self, tmp_path):
+        """Non-PNG images should be resized as JPEG."""
+        try:
+            from PIL import Image
+        except ImportError:
+            pytest.skip("Pillow not installed")
+        img = Image.new("RGB", (2000, 2000), (255, 128, 0))
+        path = tmp_path / "photo.jpg"
+        img.save(path, "JPEG", quality=95)
+
+        result = _resize_image_for_vision(path, mime_type="image/jpeg",
+                                           max_base64_bytes=50_000)
+        assert result.startswith("data:image/jpeg;base64,")
+
+    def test_constants_sane(self):
+        """Hard limit should be larger than resize target."""
+        assert _MAX_BASE64_BYTES == 20 * 1024 * 1024
+        assert _RESIZE_TARGET_BYTES == 5 * 1024 * 1024
+        assert _MAX_BASE64_BYTES > _RESIZE_TARGET_BYTES
+
+    def test_no_pillow_returns_original(self, tmp_path):
+        """Without Pillow, oversized images should be returned as-is."""
+        # Create a dummy file
+        path = tmp_path / "test.png"
+        # Write enough bytes to exceed a tiny limit
+        path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 1000)
+
+        with patch("tools.vision_tools._image_to_base64_data_url") as mock_b64:
+            # Simulate a large base64 result
+            mock_b64.return_value = "data:image/png;base64," + "A" * 200
+            with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
+                result = _resize_image_for_vision(path, max_base64_bytes=100)
+                # Should return the original (oversized) data url
+                assert len(result) > 100
+
+
+# ---------------------------------------------------------------------------
+# _is_image_size_error — detect size-related API errors
+# ---------------------------------------------------------------------------
+
+
+class TestIsImageSizeError:
+    """Tests for the size-error detection helper."""
+
+    def test_too_large_message(self):
+        assert _is_image_size_error(Exception("Request payload too large"))
+
+    def test_413_status(self):
+        assert _is_image_size_error(Exception("HTTP 413 Payload Too Large"))
+
+    def test_invalid_request(self):
+        assert _is_image_size_error(Exception("invalid_request_error: image too big"))
+
+    def test_exceeds_limit(self):
+        assert _is_image_size_error(Exception("Image exceeds maximum size"))
+
+    def test_unrelated_error(self):
+        assert not _is_image_size_error(Exception("Connection refused"))
+
+    def test_auth_error(self):
+        assert not _is_image_size_error(Exception("401 Unauthorized"))
+
+    def test_empty_message(self):
+        assert not _is_image_size_error(Exception(""))
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@@ -1873,10 +1873,10 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
                ),
            }, ensure_ascii=False)
        
-        # Read and convert to base64
-        image_data = screenshot_path.read_bytes()
-        image_base64 = base64.b64encode(image_data).decode("ascii")
-        data_url = f"data:image/png;base64,{image_base64}"
+        # Convert screenshot to base64 at full resolution.
+        _screenshot_bytes = screenshot_path.read_bytes()
+        _screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
+        data_url = f"data:image/png;base64,{_screenshot_b64}"
        
        vision_prompt = (
            f"You are analyzing a screenshot of a web browser.\n\n"
@@ -1890,7 +1890,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
        # Use the centralized LLM router
        vision_model = _get_vision_model()
        logger.debug("browser_vision: analysing screenshot (%d bytes)",
-                     len(image_data))
+                     len(_screenshot_bytes))

        # Read vision timeout from config (auxiliary.vision.timeout), default 120s.
        # Local vision models (llama.cpp, ollama) can take well over 30s for
@@ -1922,7 +1922,27 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
        }
        if vision_model:
            call_kwargs["model"] = vision_model
-        response = call_llm(**call_kwargs)
+        # Try full-size screenshot; on size-related rejection, downscale and retry.
+        try:
+            response = call_llm(**call_kwargs)
+        except Exception as _api_err:
+            from tools.vision_tools import (
+                _is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
+            )
+            if (_is_image_size_error(_api_err)
+                    and len(data_url) > _RESIZE_TARGET_BYTES):
+                logger.info(
+                    "Vision API rejected screenshot (%.1f MB); "
+                    "auto-resizing to ~%.0f MB and retrying...",
+                    len(data_url) / (1024 * 1024),
+                    _RESIZE_TARGET_BYTES / (1024 * 1024),
+                )
+                data_url = _resize_image_for_vision(
+                    screenshot_path, mime_type="image/png")
+                call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
+                response = call_llm(**call_kwargs)
+            else:
+                raise
        
        analysis = (response.choices[0].message.content or "").strip()
        # Redact secrets the vision LLM may have read from the screenshot.
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@@ -277,6 +277,120 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
    return data_url


+# Hard limit for vision API payloads (20 MB) — matches the most restrictive
+# major provider (Gemini inline data limit).  Images above this are rejected.
+_MAX_BASE64_BYTES = 20 * 1024 * 1024
+
+# Target size when auto-resizing on API failure (5 MB).  After a provider
+# rejects an image, we downscale to this target and retry once.
+_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
+
+
+def _is_image_size_error(error: Exception) -> bool:
+    """Detect if an API error is related to image or payload size."""
+    err_str = str(error).lower()
+    return any(hint in err_str for hint in (
+        "too large", "payload", "413", "content_too_large",
+        "request_too_large", "image_url", "invalid_request",
+        "exceeds", "size limit",
+    ))
+
+
+def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
+                              max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
+    """Convert an image to a base64 data URL, auto-resizing if too large.
+
+    Tries Pillow first to progressively downscale oversized images.  If Pillow
+    is not installed or resizing still exceeds the limit, falls back to the raw
+    bytes and lets the caller handle the size check.
+
+    Returns the base64 data URL string.
+    """
+    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
+    # Skip the expensive full-read + encode if Pillow can resize directly.
+    file_size = image_path.stat().st_size
+    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
+    if estimated_b64 <= max_base64_bytes:
+        # Small enough — just encode directly.
+        data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
+        if len(data_url) <= max_base64_bytes:
+            return data_url
+    else:
+        data_url = None  # defer full encode; try Pillow resize first
+
+    # Attempt auto-resize with Pillow (soft dependency)
+    try:
+        from PIL import Image
+        import io as _io
+    except ImportError:
+        logger.info("Pillow not installed — cannot auto-resize oversized image")
+        if data_url is None:
+            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
+        return data_url  # caller will raise the size error
+
+    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
+                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
+                max_base64_bytes / (1024 * 1024))
+
+    mime = mime_type or _determine_mime_type(image_path)
+    # Choose output format: JPEG for photos (smaller), PNG for transparency
+    pil_format = "PNG" if mime == "image/png" else "JPEG"
+    out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"
+
+    try:
+        img = Image.open(image_path)
+    except Exception as exc:
+        logger.info("Pillow cannot open image for resizing: %s", exc)
+        if data_url is None:
+            data_url = _image_to_base64_data_url(image_path, mime_type=mime_type)
+        return data_url  # fall through to size-check in caller
+    # Convert RGBA to RGB for JPEG output
+    if pil_format == "JPEG" and img.mode in ("RGBA", "P"):
+        img = img.convert("RGB")
+
+    # Strategy: halve dimensions until base64 fits, up to 4 rounds.
+    # For JPEG, also try reducing quality at each size step.
+    # For PNG, quality is irrelevant — only dimension reduction helps.
+    quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
+    prev_dims = (img.width, img.height)
+    candidate = None  # will be set on first loop iteration
+
+    for attempt in range(5):
+        if attempt > 0:
+            new_w = max(img.width // 2, 64)
+            new_h = max(img.height // 2, 64)
+            # Stop if dimensions can't shrink further
+            if (new_w, new_h) == prev_dims:
+                break
+            img = img.resize((new_w, new_h), Image.LANCZOS)
+            prev_dims = (new_w, new_h)
+            logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)
+
+        for q in quality_steps:
+            buf = _io.BytesIO()
+            save_kwargs = {"format": pil_format}
+            if q is not None:
+                save_kwargs["quality"] = q
+            img.save(buf, **save_kwargs)
+            encoded = base64.b64encode(buf.getvalue()).decode("ascii")
+            candidate = f"data:{out_mime};base64,{encoded}"
+            if len(candidate) <= max_base64_bytes:
+                logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
+                            len(candidate) / (1024 * 1024), q,
+                            img.width, img.height)
+                return candidate
+
+    # If we still can't get it small enough, return the best attempt
+    # and let the caller decide
+    if candidate is not None:
+        logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
+                       max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
+        return candidate
+
+    # Shouldn't reach here, but fall back to full encode
+    return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
+
+
 async def vision_analyze_tool(
    image_url: str,
    user_prompt: str,
@@ -376,24 +490,27 @@ async def vision_analyze_tool(
        if not detected_mime_type:
            raise ValueError("Only real image files are supported for vision analysis.")
        
-        # Convert image to base64 data URL
+        # Convert image to base64 — send at full resolution first.
+        # If the provider rejects it as too large, we auto-resize and retry.
        logger.info("Converting image to base64...")
        image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
-        # Calculate size in KB for better readability
        data_size_kb = len(image_data_url) / 1024
        logger.info("Image converted to base64 (%.1f KB)", data_size_kb)

-        # Pre-flight size check: most vision APIs cap base64 payloads at 5 MB.
-        # Reject early with a clear message instead of a cryptic provider 400.
-        _MAX_BASE64_BYTES = 5 * 1024 * 1024  # 5 MB
-        # The data URL includes the header (e.g. "data:image/jpeg;base64,") which
-        # is negligible, but measure the full string to be safe.
+        # Hard limit (20 MB) — no provider accepts payloads this large.
        if len(image_data_url) > _MAX_BASE64_BYTES:
-            raise ValueError(
-                f"Image too large for vision API: base64 payload is "
-                f"{len(image_data_url) / (1024 * 1024):.1f} MB (limit 5 MB). "
-                f"Resize or compress the image and try again."
-            )
+            # Try to resize down to 5 MB before giving up.
+            image_data_url = _resize_image_for_vision(
+                temp_image_path, mime_type=detected_mime_type)
+            if len(image_data_url) > _MAX_BASE64_BYTES:
+                raise ValueError(
+                    f"Image too large for vision API: base64 payload is "
+                    f"{len(image_data_url) / (1024 * 1024):.1f} MB "
+                    f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
+                    f"even after resizing. "
+                    f"Install Pillow (`pip install Pillow`) for better auto-resize, "
+                    f"or compress the image manually."
+                )

        debug_call_data["image_size_bytes"] = image_size_bytes
        
@@ -442,7 +559,24 @@ async def vision_analyze_tool(
        }
        if model:
            call_kwargs["model"] = model
-        response = await async_call_llm(**call_kwargs)
+        # Try full-size image first; on size-related rejection, downscale and retry.
+        try:
+            response = await async_call_llm(**call_kwargs)
+        except Exception as _api_err:
+            if (_is_image_size_error(_api_err)
+                    and len(image_data_url) > _RESIZE_TARGET_BYTES):
+                logger.info(
+                    "API rejected image (%.1f MB, likely too large); "
+                    "auto-resizing to ~%.0f MB and retrying...",
+                    len(image_data_url) / (1024 * 1024),
+                    _RESIZE_TARGET_BYTES / (1024 * 1024),
+                )
+                image_data_url = _resize_image_for_vision(
+                    temp_image_path, mime_type=detected_mime_type)
+                messages[0]["content"][1]["image_url"]["url"] = image_data_url
+                response = await async_call_llm(**call_kwargs)
+            else:
+                raise
        
        # Extract the analysis — fall back to reasoning if content is empty
        analysis = extract_content_or_reasoning(response)
@@ -498,8 +632,8 @@ async def vision_analyze_tool(
        elif "invalid_request" in err_str or "image_url" in err_str:
            analysis = (
                "The vision API rejected the image. This can happen when the "
-                "image is too large, in an unsupported format, or corrupted. "
-                "Try a smaller JPEG/PNG (under 3.5 MB) and retry. "
+                "image is in an unsupported format, corrupted, or still too "
+                "large after auto-resize. Try a smaller JPEG/PNG and retry. "
                f"Error: {e}"
            )
        else: