fix(vision): auto-resize oversized images, increase default timeout, fix vision capability detection

Cherry-picked from PR #7749 by kshitijk4poor with modifications:

- Raise hard image limit from 5 MB to 20 MB (matches most restrictive provider)
- Send images at full resolution first; only auto-resize to 5 MB on API failure
- Add _is_image_size_error() helper to detect size-related API rejections
- Auto-resize uses Pillow (soft dep) with progressive downscale + JPEG quality reduction
- Fix get_model_capabilities() to check modalities.input for vision support
- Increase default vision timeout from 30s to 120s (matches hardcoded fallback intent)
- Applied retry-with-resize to both vision_analyze_tool and browser_vision

Closes #7740
This commit is contained in:
kshitijk4poor
2026-04-11 11:07:18 -07:00
committed by Teknium
parent 06e1d9cdd4
commit 50bb4fe010
6 changed files with 399 additions and 25 deletions

View File

@@ -383,7 +383,14 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
# Extract capability flags (default to False if missing)
supports_tools = bool(entry.get("tool_call", False))
supports_vision = bool(entry.get("attachment", False))
# Vision: check both the `attachment` flag and `modalities.input` for "image".
# Some models (e.g. gemma-4) list image in input modalities but not attachment.
input_mods = entry.get("modalities", {})
if isinstance(input_mods, dict):
input_mods = input_mods.get("input", [])
else:
input_mods = []
supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
supports_reasoning = bool(entry.get("reasoning", False))
# Extract limits

View File

@@ -381,7 +381,7 @@ DEFAULT_CONFIG = {
"model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o"
"base_url": "", # direct OpenAI-compatible endpoint (takes precedence over provider)
"api_key": "", # API key for base_url (falls back to OPENAI_API_KEY)
"timeout": 30, # seconds — LLM API call timeout; increase for slow local vision models
"timeout": 120, # seconds — LLM API call timeout; vision payloads need generous timeout
"download_timeout": 30, # seconds — image HTTP download timeout; increase for slow connections
},
"web_extract": {

View File

@@ -7,6 +7,7 @@ from agent.models_dev import (
PROVIDER_TO_MODELS_DEV,
_extract_context,
fetch_models_dev,
get_model_capabilities,
lookup_models_dev_context,
)
@@ -195,3 +196,88 @@ class TestFetchModelsDev:
result = fetch_models_dev()
mock_get.assert_not_called()
assert result == SAMPLE_REGISTRY
# ---------------------------------------------------------------------------
# get_model_capabilities — vision via modalities.input
# ---------------------------------------------------------------------------
# Minimal models.dev-style registry used by TestGetModelCapabilities.
# Each entry isolates one way a model can (or cannot) advertise vision.
CAPS_REGISTRY = {
    "google": {
        "id": "google",
        "models": {
            # attachment is False but "image" is listed in modalities.input.
            "gemma-4-31b-it": {
                "id": "gemma-4-31b-it",
                "attachment": False,
                "tool_call": True,
                "modalities": {"input": ["text", "image"]},
                "limit": {"context": 128000, "output": 8192},
            },
            # Neither an attachment flag nor a modalities field.
            "gemma-3-1b": {
                "id": "gemma-3-1b",
                "tool_call": True,
                "limit": {"context": 32000, "output": 8192},
            },
        },
    },
    "anthropic": {
        "id": "anthropic",
        "models": {
            # attachment is True and there is no modalities field.
            "claude-sonnet-4": {
                "id": "claude-sonnet-4",
                "attachment": True,
                "tool_call": True,
                "limit": {"context": 200000, "output": 64000},
            },
        },
    },
}
class TestGetModelCapabilities:
    """Tests for get_model_capabilities vision detection."""

    @staticmethod
    def _lookup(provider, model, registry=CAPS_REGISTRY):
        """Resolve capabilities against *registry* instead of the live fetch."""
        with patch("agent.models_dev.fetch_models_dev", return_value=registry):
            return get_model_capabilities(provider, model)

    def test_vision_from_attachment_flag(self):
        """Models with attachment=True should report supports_vision=True."""
        result = self._lookup("anthropic", "claude-sonnet-4")
        assert result is not None
        assert result.supports_vision is True

    def test_vision_from_modalities_input_image(self):
        """Models with 'image' in modalities.input but attachment=False should
        still report supports_vision=True (the core fix in this PR)."""
        result = self._lookup("google", "gemma-4-31b-it")
        assert result is not None
        assert result.supports_vision is True

    def test_no_vision_without_attachment_or_modalities(self):
        """Models with neither attachment nor image modality should be non-vision."""
        result = self._lookup("google", "gemma-3-1b")
        assert result is not None
        assert result.supports_vision is False

    def test_modalities_non_dict_handled(self):
        """Non-dict modalities field should not crash."""
        odd_entry = {
            "id": "weird-model",
            "modalities": "text",  # not a dict
            "limit": {"context": 200000, "output": 8192},
        }
        odd_registry = {
            "google": {"id": "google", "models": {"weird-model": odd_entry}},
        }
        result = self._lookup("gemini", "weird-model", registry=odd_registry)
        assert result is not None
        assert result.supports_vision is False

    def test_model_not_found_returns_none(self):
        """Unknown model should return None."""
        result = self._lookup("anthropic", "nonexistent-model")
        assert result is None

View File

@@ -15,6 +15,10 @@ from tools.vision_tools import (
_handle_vision_analyze,
_determine_mime_type,
_image_to_base64_data_url,
_resize_image_for_vision,
_is_image_size_error,
_MAX_BASE64_BYTES,
_RESIZE_TARGET_BYTES,
vision_analyze_tool,
check_vision_requirements,
get_debug_session_info,
@@ -590,11 +594,13 @@ class TestBase64SizeLimit:
@pytest.mark.asyncio
async def test_oversized_image_rejected_before_api_call(self, tmp_path):
"""Images exceeding 5 MB base64 should fail with a clear size error."""
"""Images exceeding the 20 MB hard limit should fail with a clear error."""
img = tmp_path / "huge.png"
img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * (4 * 1024 * 1024))
with patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
# Patch the hard limit to a small value so the test runs fast.
with patch("tools.vision_tools._MAX_BASE64_BYTES", 1000), \
patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock) as mock_llm:
result = json.loads(await vision_analyze_tool(str(img), "describe this"))
assert result["success"] is False
@@ -686,3 +692,124 @@ class TestVisionRegistration:
entry = registry._tools.get("vision_analyze")
assert callable(entry.handler)
# ---------------------------------------------------------------------------
# _resize_image_for_vision — auto-resize oversized images
# ---------------------------------------------------------------------------
class TestResizeImageForVision:
    """Tests for the auto-resize function."""

    def test_small_image_returned_as_is(self, tmp_path):
        """Images under the limit should be returned unchanged."""
        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed")
        # A small 10x10 red PNG is far below any size limit.
        img = Image.new("RGB", (10, 10), (255, 0, 0))
        path = tmp_path / "small.png"
        img.save(path, "PNG")
        result = _resize_image_for_vision(path, mime_type="image/png")
        assert result.startswith("data:image/png;base64,")
        assert len(result) < _MAX_BASE64_BYTES

    def test_large_image_is_resized(self, tmp_path):
        """Images over the default target should be auto-resized to fit."""
        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed")
        import os

        # A solid-colour PNG compresses to a few KB no matter its dimensions,
        # so it would never exceed the target. Random noise is incompressible:
        # 1400x1400 RGB is ~5.9 MB on disk, i.e. ~7.8 MB once base64-encoded.
        side = 1400
        img = Image.frombytes("RGB", (side, side), os.urandom(side * side * 3))
        path = tmp_path / "large.png"
        img.save(path, "PNG")
        # Guard the precondition so this test cannot pass without resizing.
        assert path.stat().st_size * 4 // 3 > _RESIZE_TARGET_BYTES
        result = _resize_image_for_vision(path, mime_type="image/png")
        assert result.startswith("data:image/png;base64,")
        # Default target is _RESIZE_TARGET_BYTES (5 MB), not _MAX_BASE64_BYTES (20 MB)
        assert len(result) <= _RESIZE_TARGET_BYTES

    def test_custom_max_bytes(self, tmp_path):
        """The max_base64_bytes parameter should be respected."""
        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed")
        img = Image.new("RGB", (200, 200), (0, 128, 255))
        path = tmp_path / "medium.png"
        img.save(path, "PNG")
        # Set a very low limit to force resizing
        result = _resize_image_for_vision(path, max_base64_bytes=500)
        # Should still return a valid data URL
        assert result.startswith("data:image/")

    def test_jpeg_output_for_non_png(self, tmp_path):
        """Non-PNG images should be resized as JPEG."""
        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed")
        img = Image.new("RGB", (2000, 2000), (255, 128, 0))
        path = tmp_path / "photo.jpg"
        img.save(path, "JPEG", quality=95)
        result = _resize_image_for_vision(path, mime_type="image/jpeg",
                                          max_base64_bytes=50_000)
        assert result.startswith("data:image/jpeg;base64,")

    def test_constants_sane(self):
        """Hard limit should be larger than resize target."""
        assert _MAX_BASE64_BYTES == 20 * 1024 * 1024
        assert _RESIZE_TARGET_BYTES == 5 * 1024 * 1024
        assert _MAX_BASE64_BYTES > _RESIZE_TARGET_BYTES

    def test_no_pillow_returns_original(self, tmp_path):
        """Without Pillow, oversized images should be returned as-is."""
        path = tmp_path / "test.png"
        # Write enough bytes that the size estimate exceeds the tiny limit below.
        path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 1000)
        with patch("tools.vision_tools._image_to_base64_data_url") as mock_b64:
            # Simulate a large base64 result
            mock_b64.return_value = "data:image/png;base64," + "A" * 200
            # A None entry in sys.modules makes `from PIL import Image` raise ImportError.
            with patch.dict("sys.modules", {"PIL": None, "PIL.Image": None}):
                result = _resize_image_for_vision(path, max_base64_bytes=100)
        # Should return the original (oversized) data url
        assert len(result) > 100
# ---------------------------------------------------------------------------
# _is_image_size_error — detect size-related API errors
# ---------------------------------------------------------------------------
class TestIsImageSizeError:
    """Tests for the size-error detection helper."""

    def test_too_large_message(self):
        """A 'too large' payload message counts as a size error."""
        err = Exception("Request payload too large")
        assert _is_image_size_error(err)

    def test_413_status(self):
        """An HTTP 413 status in the message counts as a size error."""
        err = Exception("HTTP 413 Payload Too Large")
        assert _is_image_size_error(err)

    def test_invalid_request(self):
        """An invalid_request_error message counts as a size error."""
        err = Exception("invalid_request_error: image too big")
        assert _is_image_size_error(err)

    def test_exceeds_limit(self):
        """An 'exceeds' message counts as a size error."""
        err = Exception("Image exceeds maximum size")
        assert _is_image_size_error(err)

    def test_unrelated_error(self):
        """A connection failure is not a size error."""
        err = Exception("Connection refused")
        assert not _is_image_size_error(err)

    def test_auth_error(self):
        """An authentication failure is not a size error."""
        err = Exception("401 Unauthorized")
        assert not _is_image_size_error(err)

    def test_empty_message(self):
        """An exception with an empty message is not a size error."""
        err = Exception("")
        assert not _is_image_size_error(err)

View File

@@ -1873,10 +1873,10 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
),
}, ensure_ascii=False)
# Read and convert to base64
image_data = screenshot_path.read_bytes()
image_base64 = base64.b64encode(image_data).decode("ascii")
data_url = f"data:image/png;base64,{image_base64}"
# Convert screenshot to base64 at full resolution.
_screenshot_bytes = screenshot_path.read_bytes()
_screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
data_url = f"data:image/png;base64,{_screenshot_b64}"
vision_prompt = (
f"You are analyzing a screenshot of a web browser.\n\n"
@@ -1890,7 +1890,7 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
# Use the centralized LLM router
vision_model = _get_vision_model()
logger.debug("browser_vision: analysing screenshot (%d bytes)",
len(image_data))
len(_screenshot_bytes))
# Read vision timeout from config (auxiliary.vision.timeout), default 120s.
# Local vision models (llama.cpp, ollama) can take well over 30s for
@@ -1922,7 +1922,27 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
}
if vision_model:
call_kwargs["model"] = vision_model
response = call_llm(**call_kwargs)
# Try full-size screenshot; on size-related rejection, downscale and retry.
try:
response = call_llm(**call_kwargs)
except Exception as _api_err:
from tools.vision_tools import (
_is_image_size_error, _resize_image_for_vision, _RESIZE_TARGET_BYTES,
)
if (_is_image_size_error(_api_err)
and len(data_url) > _RESIZE_TARGET_BYTES):
logger.info(
"Vision API rejected screenshot (%.1f MB); "
"auto-resizing to ~%.0f MB and retrying...",
len(data_url) / (1024 * 1024),
_RESIZE_TARGET_BYTES / (1024 * 1024),
)
data_url = _resize_image_for_vision(
screenshot_path, mime_type="image/png")
call_kwargs["messages"][0]["content"][1]["image_url"]["url"] = data_url
response = call_llm(**call_kwargs)
else:
raise
analysis = (response.choices[0].message.content or "").strip()
# Redact secrets the vision LLM may have read from the screenshot.

View File

@@ -277,6 +277,120 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
return data_url
# Hard limit for vision API payloads (20 MB) — matches the most restrictive
# major provider (Gemini inline data limit). vision_analyze_tool first tries
# to downscale an image above this limit and rejects it only if the result
# is still above the limit.
_MAX_BASE64_BYTES = 20 * 1024 * 1024
# Target size when auto-resizing (5 MB); the default max_base64_bytes of
# _resize_image_for_vision. After a provider rejects an image as too large,
# we downscale to this target and retry once.
_RESIZE_TARGET_BYTES = 5 * 1024 * 1024
def _is_image_size_error(error: Exception) -> bool:
"""Detect if an API error is related to image or payload size."""
err_str = str(error).lower()
return any(hint in err_str for hint in (
"too large", "payload", "413", "content_too_large",
"request_too_large", "image_url", "invalid_request",
"exceeds", "size limit",
))
def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
                             max_base64_bytes: int = _RESIZE_TARGET_BYTES) -> str:
    """Convert an image to a base64 data URL, auto-resizing if too large.

    If the file already fits within ``max_base64_bytes`` it is encoded as-is.
    Otherwise Pillow (a soft dependency) is used to halve the dimensions up to
    four times, and for JPEG output to step the quality down at each size.

    Args:
        image_path: Path of the image file to encode.
        mime_type: MIME type of the image; detected from the file when omitted.
        max_base64_bytes: Maximum length of the returned data URL string.

    Returns:
        A base64 data URL. It may still exceed ``max_base64_bytes``: when
        Pillow is missing or cannot open the file, the original image is
        returned unmodified; when resizing cannot reach the limit, the last
        (smallest) re-encoded attempt is returned. Callers must check the
        size themselves.
    """
    def _encode_original() -> str:
        return _image_to_base64_data_url(image_path, mime_type=mime_type)

    # Quick file-size estimate: base64 expands by ~4/3, plus data URL header.
    # When the estimate is already over the limit, defer the expensive
    # full-read + encode and try a Pillow resize first.
    file_size = image_path.stat().st_size
    estimated_b64 = (file_size * 4) // 3 + 100  # ~header overhead
    data_url = None
    if estimated_b64 <= max_base64_bytes:
        data_url = _encode_original()
        if len(data_url) <= max_base64_bytes:
            return data_url

    # Attempt auto-resize with Pillow (soft dependency)
    try:
        from PIL import Image
        import io as _io
    except ImportError:
        logger.info("Pillow not installed — cannot auto-resize oversized image")
        return data_url if data_url is not None else _encode_original()

    logger.info("Image file is %.1f MB (estimated base64 %.1f MB, limit %.1f MB), auto-resizing...",
                file_size / (1024 * 1024), estimated_b64 / (1024 * 1024),
                max_base64_bytes / (1024 * 1024))
    mime = mime_type or _determine_mime_type(image_path)
    # Choose output format: JPEG for photos (smaller), PNG for transparency
    pil_format = "PNG" if mime == "image/png" else "JPEG"
    out_mime = "image/png" if pil_format == "PNG" else "image/jpeg"
    # Modes Pillow's JPEG encoder can write directly; anything else (RGBA, P,
    # LA, PA, I;16, F, ...) must be converted or save() raises.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    try:
        # Pillow opens lazily and keeps the file handle; copy()/convert()
        # pull the pixels into memory so the handle can be closed right away.
        with Image.open(image_path) as src:
            if pil_format == "JPEG" and src.mode not in jpeg_modes:
                img = src.convert("RGB")
            else:
                img = src.copy()
    except Exception as exc:
        logger.info("Pillow cannot open image for resizing: %s", exc)
        return data_url if data_url is not None else _encode_original()

    # Strategy: halve dimensions until base64 fits, up to 4 rounds.
    # For JPEG, also try reducing quality at each size step.
    # For PNG, quality is irrelevant — only dimension reduction helps.
    quality_steps = (85, 70, 50) if pil_format == "JPEG" else (None,)
    min_side = 64
    candidate = None
    try:
        for attempt in range(5):
            if attempt > 0:
                # Floor each side at min_side, but never above its current
                # size: a side that is already smaller must not be upscaled.
                new_w = max(img.width // 2, min(img.width, min_side))
                new_h = max(img.height // 2, min(img.height, min_side))
                # Stop if dimensions can't shrink further
                if (new_w, new_h) == (img.width, img.height):
                    break
                img = img.resize((new_w, new_h), Image.LANCZOS)
                logger.info("Resized to %dx%d (attempt %d)", new_w, new_h, attempt)
            for q in quality_steps:
                buf = _io.BytesIO()
                save_kwargs = {"format": pil_format}
                if q is not None:
                    save_kwargs["quality"] = q
                img.save(buf, **save_kwargs)
                encoded = base64.b64encode(buf.getvalue()).decode("ascii")
                candidate = f"data:{out_mime};base64,{encoded}"
                if len(candidate) <= max_base64_bytes:
                    logger.info("Auto-resized image fits: %.1f MB (quality=%s, %dx%d)",
                                len(candidate) / (1024 * 1024), q,
                                img.width, img.height)
                    return candidate
    except Exception as exc:
        # Best-effort helper: an encoder failure must not mask the caller's
        # own size error, so fall back to whatever we have.
        logger.warning("Auto-resize failed while re-encoding image: %s", exc)

    # If we still can't get it small enough, return the best attempt
    # and let the caller decide
    if candidate is not None:
        logger.warning("Auto-resize could not fit image under %.1f MB (best: %.1f MB)",
                       max_base64_bytes / (1024 * 1024), len(candidate) / (1024 * 1024))
        return candidate
    # Only reached when re-encoding failed before producing any candidate.
    return data_url if data_url is not None else _encode_original()
async def vision_analyze_tool(
image_url: str,
user_prompt: str,
@@ -376,24 +490,27 @@ async def vision_analyze_tool(
if not detected_mime_type:
raise ValueError("Only real image files are supported for vision analysis.")
# Convert image to base64 data URL
# Convert image to base64 — send at full resolution first.
# If the provider rejects it as too large, we auto-resize and retry.
logger.info("Converting image to base64...")
image_data_url = _image_to_base64_data_url(temp_image_path, mime_type=detected_mime_type)
# Calculate size in KB for better readability
data_size_kb = len(image_data_url) / 1024
logger.info("Image converted to base64 (%.1f KB)", data_size_kb)
# Pre-flight size check: most vision APIs cap base64 payloads at 5 MB.
# Reject early with a clear message instead of a cryptic provider 400.
_MAX_BASE64_BYTES = 5 * 1024 * 1024 # 5 MB
# The data URL includes the header (e.g. "data:image/jpeg;base64,") which
# is negligible, but measure the full string to be safe.
# Hard limit (20 MB) — no provider accepts payloads this large.
if len(image_data_url) > _MAX_BASE64_BYTES:
raise ValueError(
f"Image too large for vision API: base64 payload is "
f"{len(image_data_url) / (1024 * 1024):.1f} MB (limit 5 MB). "
f"Resize or compress the image and try again."
)
# Try to resize down to 5 MB before giving up.
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
if len(image_data_url) > _MAX_BASE64_BYTES:
raise ValueError(
f"Image too large for vision API: base64 payload is "
f"{len(image_data_url) / (1024 * 1024):.1f} MB "
f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
f"even after resizing. "
f"Install Pillow (`pip install Pillow`) for better auto-resize, "
f"or compress the image manually."
)
debug_call_data["image_size_bytes"] = image_size_bytes
@@ -442,7 +559,24 @@ async def vision_analyze_tool(
}
if model:
call_kwargs["model"] = model
response = await async_call_llm(**call_kwargs)
# Try full-size image first; on size-related rejection, downscale and retry.
try:
response = await async_call_llm(**call_kwargs)
except Exception as _api_err:
if (_is_image_size_error(_api_err)
and len(image_data_url) > _RESIZE_TARGET_BYTES):
logger.info(
"API rejected image (%.1f MB, likely too large); "
"auto-resizing to ~%.0f MB and retrying...",
len(image_data_url) / (1024 * 1024),
_RESIZE_TARGET_BYTES / (1024 * 1024),
)
image_data_url = _resize_image_for_vision(
temp_image_path, mime_type=detected_mime_type)
messages[0]["content"][1]["image_url"]["url"] = image_data_url
response = await async_call_llm(**call_kwargs)
else:
raise
# Extract the analysis — fall back to reasoning if content is empty
analysis = extract_content_or_reasoning(response)
@@ -498,8 +632,8 @@ async def vision_analyze_tool(
elif "invalid_request" in err_str or "image_url" in err_str:
analysis = (
"The vision API rejected the image. This can happen when the "
"image is too large, in an unsupported format, or corrupted. "
"Try a smaller JPEG/PNG (under 3.5 MB) and retry. "
"image is in an unsupported format, corrupted, or still too "
"large after auto-resize. Try a smaller JPEG/PNG and retry. "
f"Error: {e}"
)
else: