Compare commits


3 Commits

Author SHA1 Message Date
Alexander Whitestone
9d05f77a9b feat: harden vision benchmark artifacts
Refs #817
2026-04-22 12:22:28 -04:00
Alexander Whitestone
23e093fc75 wip: tighten vision benchmark acceptance tests 2026-04-22 12:10:23 -04:00
Alexander Whitestone
f77ce4dff2 wip: add regression tests for vision benchmark artifacts 2026-04-22 12:07:52 -04:00
14 changed files with 1137 additions and 1116 deletions

View File

@@ -1396,8 +1396,6 @@ def normalize_anthropic_response(
"tool_use": "tool_calls",
"max_tokens": "length",
"stop_sequence": "stop",
"refusal": "content_filter",
"model_context_window_exceeded": "length",
}
finish_reason = stop_reason_map.get(response.stop_reason, "stop")
@@ -1411,3 +1409,42 @@ def normalize_anthropic_response(
),
finish_reason,
)
def normalize_anthropic_response_v2(
response,
strip_tool_prefix: bool = False,
) -> "NormalizedResponse":
"""Normalize Anthropic response to NormalizedResponse.
Wraps the existing normalize_anthropic_response() and maps its output
to the shared transport types. This allows incremental migration
without disturbing the legacy call sites.
"""
from agent.transports.types import NormalizedResponse, build_tool_call
assistant_msg, finish_reason = normalize_anthropic_response(response, strip_tool_prefix)
tool_calls = None
if assistant_msg.tool_calls:
tool_calls = [
build_tool_call(
id=tc.id,
name=tc.function.name,
arguments=tc.function.arguments,
)
for tc in assistant_msg.tool_calls
]
provider_data = {}
if getattr(assistant_msg, "reasoning_details", None):
provider_data["reasoning_details"] = assistant_msg.reasoning_details
return NormalizedResponse(
content=assistant_msg.content,
tool_calls=tool_calls,
finish_reason=finish_reason,
reasoning=getattr(assistant_msg, "reasoning", None),
usage=None,
provider_data=provider_data or None,
)
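Review note: a minimal sketch of the incremental migration this v2 function enables at a call site. Only normalize_anthropic_response_v2 is from this diff; run_tool and handle_turn are hypothetical names.

from agent.anthropic_adapter import normalize_anthropic_response_v2

def handle_turn(raw_response):
    # One normalized object instead of the legacy (message, finish_reason) tuple.
    normalized = normalize_anthropic_response_v2(raw_response)
    if normalized.finish_reason == "tool_calls" and normalized.tool_calls:
        for tc in normalized.tool_calls:
            run_tool(tc.name, tc.arguments)  # run_tool is a hypothetical dispatcher
    return normalized.content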

View File

@@ -0,0 +1,57 @@
"""Transport layer types and registry for provider response normalization.
Usage:
from agent.transports import get_transport
transport = get_transport("anthropic_messages")
result = transport.normalize_response(raw_response)
"""
from agent.transports.types import ( # noqa: F401
NormalizedResponse,
ToolCall,
Usage,
build_tool_call,
map_finish_reason,
)
_REGISTRY: dict = {}
def register_transport(api_mode: str, transport_cls: type) -> None:
"""Register a transport class for an api_mode string."""
_REGISTRY[api_mode] = transport_cls
def get_transport(api_mode: str):
"""Get a transport instance for the given api_mode.
Returns None if no transport is registered for this api_mode.
This allows gradual migration — call sites can check for None
and fall back to the legacy code path.
"""
if not _REGISTRY:
_discover_transports()
cls = _REGISTRY.get(api_mode)
if cls is None:
return None
return cls()
def _discover_transports() -> None:
"""Import all transport modules to trigger auto-registration."""
try:
import agent.transports.anthropic # noqa: F401
except ImportError:
pass
try:
import agent.transports.codex # noqa: F401
except ImportError:
pass
try:
import agent.transports.chat_completions # noqa: F401
except ImportError:
pass
try:
import agent.transports.bedrock # noqa: F401
except ImportError:
pass
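Review note: a minimal sketch of the fallback pattern the get_transport docstring describes. legacy_normalize is a hypothetical stand-in for the existing code path; only get_transport is from this file.

from agent.transports import get_transport

def normalize_for_mode(api_mode: str, raw_response):
    transport = get_transport(api_mode)
    if transport is None:
        # No transport registered yet: stay on the legacy code path.
        return legacy_normalize(raw_response)  # hypothetical legacy helper
    return transport.normalize_response(raw_response)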

View File

@@ -0,0 +1,95 @@
"""Anthropic Messages API transport.
Delegates to the existing adapter functions in agent/anthropic_adapter.py.
This transport owns format conversion and normalization — NOT client lifecycle.
"""
from typing import Any, Dict, List, Optional
from agent.transports.base import ProviderTransport
from agent.transports.types import NormalizedResponse
class AnthropicTransport(ProviderTransport):
"""Transport for api_mode='anthropic_messages'."""
@property
def api_mode(self) -> str:
return "anthropic_messages"
def convert_messages(self, messages: List[Dict[str, Any]], **kwargs) -> Any:
from agent.anthropic_adapter import convert_messages_to_anthropic
base_url = kwargs.get("base_url")
return convert_messages_to_anthropic(messages, base_url=base_url)
def convert_tools(self, tools: List[Dict[str, Any]]) -> Any:
from agent.anthropic_adapter import convert_tools_to_anthropic
return convert_tools_to_anthropic(tools)
def build_kwargs(
self,
model: str,
messages: List[Dict[str, Any]],
tools: Optional[List[Dict[str, Any]]] = None,
**params,
) -> Dict[str, Any]:
from agent.anthropic_adapter import build_anthropic_kwargs
return build_anthropic_kwargs(
model=model,
messages=messages,
tools=tools,
max_tokens=params.get("max_tokens", 16384),
reasoning_config=params.get("reasoning_config"),
tool_choice=params.get("tool_choice"),
is_oauth=params.get("is_oauth", False),
preserve_dots=params.get("preserve_dots", False),
context_length=params.get("context_length"),
base_url=params.get("base_url"),
fast_mode=params.get("fast_mode", False),
)
def normalize_response(self, response: Any, **kwargs) -> NormalizedResponse:
from agent.anthropic_adapter import normalize_anthropic_response_v2
strip_tool_prefix = kwargs.get("strip_tool_prefix", False)
return normalize_anthropic_response_v2(response, strip_tool_prefix=strip_tool_prefix)
def validate_response(self, response: Any) -> bool:
if response is None:
return False
content_blocks = getattr(response, "content", None)
if not isinstance(content_blocks, list):
return False
if not content_blocks:
return False
return True
def extract_cache_stats(self, response: Any):
usage = getattr(response, "usage", None)
if usage is None:
return None
cached = getattr(usage, "cache_read_input_tokens", 0) or 0
written = getattr(usage, "cache_creation_input_tokens", 0) or 0
if cached or written:
return {"cached_tokens": cached, "creation_tokens": written}
return None
_STOP_REASON_MAP = {
"end_turn": "stop",
"tool_use": "tool_calls",
"max_tokens": "length",
"stop_sequence": "stop",
"refusal": "content_filter",
"model_context_window_exceeded": "length",
}
def map_finish_reason(self, raw_reason: str) -> str:
return self._STOP_REASON_MAP.get(raw_reason, "stop")
from agent.transports import register_transport # noqa: E402
register_transport("anthropic_messages", AnthropicTransport)
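Review note: a sketch of the data path this transport owns, assuming (per the base-class docstring) that the caller still constructs the client and supplies OpenAI-format messages and tools. The model id and the variables openai_messages, tools, and client are assumptions, not part of this diff.

from agent.transports import get_transport

transport = get_transport("anthropic_messages")
kwargs = transport.build_kwargs(model="claude-sonnet-4", messages=openai_messages, tools=tools)  # model id hypothetical
raw = client.messages.create(**kwargs)  # client lifecycle stays on AIAgent
if transport.validate_response(raw):
    normalized = transport.normalize_response(raw, strip_tool_prefix=False)
    cache_stats = transport.extract_cache_stats(raw)  # None unless cache tokens are present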

View File

@@ -0,0 +1,61 @@
"""Abstract base for provider transports.
A transport owns the data path for one api_mode:
convert_messages → convert_tools → build_kwargs → normalize_response
It does NOT own: client construction, streaming, credential refresh,
prompt caching, interrupt handling, or retry logic. Those stay on AIAgent.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
from agent.transports.types import NormalizedResponse
class ProviderTransport(ABC):
"""Base class for provider-specific format conversion and normalization."""
@property
@abstractmethod
def api_mode(self) -> str:
"""The api_mode string this transport handles."""
...
@abstractmethod
def convert_messages(self, messages: List[Dict[str, Any]], **kwargs) -> Any:
"""Convert OpenAI-format messages to provider-native format."""
...
@abstractmethod
def convert_tools(self, tools: List[Dict[str, Any]]) -> Any:
"""Convert OpenAI-format tool definitions to provider-native format."""
...
@abstractmethod
def build_kwargs(
self,
model: str,
messages: List[Dict[str, Any]],
tools: Optional[List[Dict[str, Any]]] = None,
**params,
) -> Dict[str, Any]:
"""Build the complete provider kwargs dict."""
...
@abstractmethod
def normalize_response(self, response: Any, **kwargs) -> NormalizedResponse:
"""Normalize a raw provider response to the shared NormalizedResponse type."""
...
def validate_response(self, response: Any) -> bool:
"""Optional structural validation for raw responses."""
return True
def extract_cache_stats(self, response: Any) -> Optional[Dict[str, int]]:
"""Optional cache stats extraction."""
return None
def map_finish_reason(self, raw_reason: str) -> str:
"""Optional stop-reason mapping. Defaults to passthrough."""
return raw_reason
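Review note: a hypothetical minimal subclass, not part of this PR, showing what a new api_mode must implement against this ABC. The optional hooks (validate_response, extract_cache_stats, map_finish_reason) inherit the defaults above.

from typing import Any, Dict, List, Optional
from agent.transports.base import ProviderTransport
from agent.transports.types import NormalizedResponse

class EchoTransport(ProviderTransport):
    """Illustrative passthrough transport for a provider that already speaks OpenAI format."""
    @property
    def api_mode(self) -> str:
        return "echo"
    def convert_messages(self, messages: List[Dict[str, Any]], **kwargs) -> Any:
        return messages  # no conversion needed
    def convert_tools(self, tools: List[Dict[str, Any]]) -> Any:
        return tools
    def build_kwargs(self, model: str, messages: List[Dict[str, Any]], tools: Optional[List[Dict[str, Any]]] = None, **params) -> Dict[str, Any]:
        return {"model": model, "messages": messages, "tools": tools}
    def normalize_response(self, response: Any, **kwargs) -> NormalizedResponse:
        return NormalizedResponse(content=str(response), tool_calls=None, finish_reason="stop")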

View File

@@ -0,0 +1,58 @@
"""Shared types for normalized provider responses."""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@dataclass
class ToolCall:
"""A normalized tool call from any provider."""
id: Optional[str]
name: str
arguments: str
provider_data: Optional[Dict[str, Any]] = field(default=None, repr=False)
@dataclass
class Usage:
"""Token usage from an API response."""
prompt_tokens: int = 0
completion_tokens: int = 0
total_tokens: int = 0
cached_tokens: int = 0
@dataclass
class NormalizedResponse:
"""Normalized API response from any provider."""
content: Optional[str]
tool_calls: Optional[List[ToolCall]]
finish_reason: str
reasoning: Optional[str] = None
usage: Optional[Usage] = None
provider_data: Optional[Dict[str, Any]] = field(default=None, repr=False)
def build_tool_call(
id: Optional[str],
name: str,
arguments: Any,
**provider_fields: Any,
) -> ToolCall:
"""Build a ToolCall, auto-serialising dict arguments."""
args_str = json.dumps(arguments) if isinstance(arguments, dict) else str(arguments)
provider_data = dict(provider_fields) if provider_fields else None
return ToolCall(id=id, name=name, arguments=args_str, provider_data=provider_data)
def map_finish_reason(reason: Optional[str], mapping: Dict[str, str]) -> str:
"""Translate a provider-specific stop reason to the normalized set."""
if reason is None:
return "stop"
return mapping.get(reason, "stop")
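Review note: the two helpers above in action; values follow directly from the definitions in this file.

from agent.transports.types import build_tool_call, map_finish_reason

tc = build_tool_call(id="toolu_123", name="terminal", arguments={"command": "ls"})
assert tc.arguments == '{"command": "ls"}'  # dict arguments are auto-serialised to a JSON string

# Unknown and missing stop reasons both collapse to "stop".
assert map_finish_reason("tool_use", {"tool_use": "tool_calls"}) == "tool_calls"
assert map_finish_reason(None, {}) == "stop"
assert map_finish_reason("weird_reason", {}) == "stop"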

View File

@@ -1,194 +1,757 @@
[
{
"id": "screenshot_github_home",
"id": "screenshot_github_mark",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "mark"],
"expected_keywords": [
"github",
"logo",
"mark"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_mermaid_flow",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
"id": "screenshot_github_social",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": [
"github",
"page",
"web"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_github_code_search",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": [
"search",
"code",
"feature"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_terminal_capture",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": [
"terminal",
"command",
"output"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_http_404",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": [
"404",
"error",
"cat"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_01",
"url": "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01",
"category": "screenshot",
"expected_keywords": [
"hermes",
"cli",
"session"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_02",
"url": "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard",
"category": "screenshot",
"expected_keywords": [
"prompt",
"cache",
"dashboard"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_01",
"url": "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode",
"category": "screenshot",
"expected_keywords": [
"settings",
"voice",
"mode"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_02",
"url": "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview",
"category": "screenshot",
"expected_keywords": [
"browser",
"vision",
"preview"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_03",
"url": "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector",
"category": "screenshot",
"expected_keywords": [
"tool",
"call",
"inspector"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_flow_a",
"url": "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker",
"category": "diagram",
"expected_keywords": ["flow", "diagram", "process"],
"expected_keywords": [
"flowchart",
"api",
"worker"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_1",
"url": "https://picsum.photos/seed/vision1/400/300",
"category": "photo",
"expected_keywords": [],
"id": "diagram_flow_b",
"url": "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client",
"category": "diagram",
"expected_keywords": [
"architecture",
"diagram",
"cache"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_2",
"url": "https://picsum.photos/seed/vision2/400/300",
"category": "photo",
"expected_keywords": [],
"id": "diagram_uml_a",
"url": "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message",
"category": "diagram",
"expected_keywords": [
"class",
"diagram",
"session"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "chart_simple_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "revenue"],
"id": "diagram_uml_b",
"url": "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response",
"category": "diagram",
"expected_keywords": [
"sequence",
"diagram",
"response"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "chart_pie",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": ["pie", "chart", "percentage"],
"id": "diagram_network_a",
"url": "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router",
"category": "diagram",
"expected_keywords": [
"network",
"node",
"router"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_network_b",
"url": "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth",
"category": "diagram",
"expected_keywords": [
"service",
"mesh",
"auth"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_state_machine",
"url": "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop",
"category": "diagram",
"expected_keywords": [
"state",
"machine",
"idle"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_mind_map",
"url": "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools",
"category": "diagram",
"expected_keywords": [
"mind",
"memory",
"tools"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_pipeline",
"url": "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize",
"category": "diagram",
"expected_keywords": [
"pipeline",
"ingest",
"summarize"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_org_chart",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops",
"category": "diagram",
"expected_keywords": ["organization", "hierarchy", "chart"],
"expected_keywords": [
"org",
"chart",
"review"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "screenshot_terminal",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"id": "photo_random_01",
"url": "https://picsum.photos/seed/vision-bench-1/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_line",
"id": "photo_random_02",
"url": "https://picsum.photos/seed/vision-bench-2/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_03",
"url": "https://picsum.photos/seed/vision-bench-3/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_04",
"url": "https://picsum.photos/seed/vision-bench-4/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_05",
"url": "https://picsum.photos/seed/vision-bench-5/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_06",
"url": "https://picsum.photos/seed/vision-bench-6/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_07",
"url": "https://picsum.photos/seed/vision-bench-7/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_08",
"url": "https://picsum.photos/seed/vision-bench-8/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_09",
"url": "https://picsum.photos/seed/vision-bench-9/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_10",
"url": "https://picsum.photos/seed/vision-bench-10/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_bar_quarterly",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": [
"bar",
"chart",
"revenue"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_pie_market",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": [
"pie",
"chart",
"percentage"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_line_temp",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"category": "chart",
"expected_keywords": ["line", "chart", "temperature"],
"expected_keywords": [
"line",
"chart",
"temperature"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "diagram_sequence",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_radar",
"id": "chart_radar_skill",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"category": "chart",
"expected_keywords": ["radar", "chart", "skill"],
"expected_keywords": [
"radar",
"chart",
"skill"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_5",
"url": "https://picsum.photos/seed/vision5/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": ["doughnut", "chart", "device"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
},
{
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_stacked_bar",
"id": "chart_stacked_cloud",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"category": "chart",
"expected_keywords": ["stacked", "bar", "chart"],
"expected_keywords": [
"stacked",
"bar",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "screenshot_dashboard",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": ["search", "code", "feature"],
"id": "chart_area_growth",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}",
"category": "chart",
"expected_keywords": [
"line",
"growth",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_8",
"url": "https://picsum.photos/seed/vision8/400/300",
"category": "photo",
"expected_keywords": [],
"id": "chart_scatter_eval",
"url": "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}",
"category": "chart",
"expected_keywords": [
"scatter",
"chart",
"runs"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_horizontal_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}",
"category": "chart",
"expected_keywords": [
"bar",
"score",
"ocr"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_bubble_usage",
"url": "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}",
"category": "chart",
"expected_keywords": [
"bubble",
"latency",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_doughnut_devices",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": [
"doughnut",
"chart",
"device"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "ocr_text_01",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Hermes+OCR+Alpha+01",
"category": "ocr",
"expected_keywords": [
"hermes",
"ocr"
],
"ground_truth_ocr": "Hermes OCR Alpha 01",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_02",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Prompt+Cache+Hit+87%",
"category": "ocr",
"expected_keywords": [
"prompt",
"cache"
],
"ground_truth_ocr": "Prompt Cache Hit 87%",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_03",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Session+42+Ready",
"category": "ocr",
"expected_keywords": [
"session",
"42"
],
"ground_truth_ocr": "Session 42 Ready",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_04",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Latency+118+ms",
"category": "ocr",
"expected_keywords": [
"latency",
"118"
],
"ground_truth_ocr": "Latency 118 ms",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_05",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Voice+Mode+Enabled",
"category": "ocr",
"expected_keywords": [
"voice",
"mode"
],
"ground_truth_ocr": "Voice Mode Enabled",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "document_text_01",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Invoice+1001+Total+42+Due+2026-04-22",
"category": "document",
"expected_keywords": [
"invoice",
"1001",
"total"
],
"ground_truth_ocr": "Invoice 1001 Total 42 Due 2026-04-22",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_02",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Form+A+Name+Alice+Status+Approved",
"category": "document",
"expected_keywords": [
"form",
"a",
"name"
],
"ground_truth_ocr": "Form A Name Alice Status Approved",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "document_text_03",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Report+Memory+Recall+Score+91+Percent",
"category": "document",
"expected_keywords": [
"report",
"memory",
"recall"
],
"ground_truth_ocr": "Report Memory Recall Score 91 Percent",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_04",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Checklist+Crisis+Escalation+Call+988+Now",
"category": "document",
"expected_keywords": [
"checklist",
"crisis",
"escalation"
],
"ground_truth_ocr": "Checklist Crisis Escalation Call 988 Now",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_05",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Meeting+Notes+Vision+Benchmark+Run+Pending",
"category": "document",
"expected_keywords": [
"meeting",
"notes",
"vision"
],
"ground_truth_ocr": "Meeting Notes Vision Benchmark Run Pending",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
}
]
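Review note: a small validation sketch for this dataset, checking the keys every entry above carries. The filename is an assumption; the key set is taken from the JSON itself.

import json

REQUIRED_KEYS = {"id", "url", "category", "expected_keywords", "ground_truth_ocr", "expected_structure"}

with open("vision_benchmark_images.json", encoding="utf-8") as f:  # path is hypothetical
    entries = json.load(f)

ids = [entry["id"] for entry in entries]
assert len(set(ids)) == len(ids), "duplicate image ids"
for entry in entries:
    missing = REQUIRED_KEYS - entry.keys()
    assert not missing, f"{entry['id']}: missing keys {missing}"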

View File

@@ -22,10 +22,12 @@ import argparse
import asyncio
import base64
import json
import mimetypes
import os
import statistics
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
@@ -41,12 +43,16 @@ MODELS = {
"model_id": "google/gemma-4-27b-it",
"display_name": "Gemma 4 27B",
"provider": "nous",
"fallback_provider": "ollama",
"fallback_model_id": "gemma4:latest",
"description": "Google's multimodal Gemma 4 model",
},
"gemini3_flash": {
"model_id": "google/gemini-3-flash-preview",
"display_name": "Gemini 3 Flash Preview",
"provider": "openrouter",
"fallback_provider": "gemini",
"fallback_model_id": "gemini-2.5-flash",
"description": "Current default vision model",
},
}
@@ -84,91 +90,150 @@ async def analyze_with_model(
"""
import httpx
def _load_image_bytes_cached() -> tuple[bytes, str]:
nonlocal _image_bytes, _mime_type
if _image_bytes is not None:
return _image_bytes, _mime_type
if image_url.startswith(("http://", "https://")):
with urllib.request.urlopen(image_url, timeout=30) as resp:
_image_bytes = resp.read()
_mime_type = resp.headers.get_content_type() or mimetypes.guess_type(image_url)[0] or "image/png"
else:
path = Path(image_url).expanduser()
_image_bytes = path.read_bytes()
_mime_type = mimetypes.guess_type(str(path))[0] or "image/png"
return _image_bytes, _mime_type
def _data_url() -> str:
image_bytes, mime_type = _load_image_bytes_cached()
return f"data:{mime_type};base64,{base64.b64encode(image_bytes).decode()}"
def _provider_key(provider: str) -> str:
if provider == "openrouter":
return os.getenv("OPENROUTER_API_KEY", "")
if provider == "nous":
return os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
if provider == "gemini":
return os.getenv("GEMINI_API_KEY", "") or os.getenv("GOOGLE_API_KEY", "")
return os.getenv(f"{provider.upper()}_API_KEY", "")
provider = model_config["provider"]
model_id = model_config["model_id"]
candidates = [(provider, model_id)]
if model_config.get("fallback_provider") and model_config.get("fallback_model_id"):
candidates.append((model_config["fallback_provider"], model_config["fallback_model_id"]))
# Prepare messages
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
_image_bytes: Optional[bytes] = None
_mime_type = "image/png"
failures = []
# Route to provider
if provider == "openrouter":
api_url = "https://openrouter.ai/api/v1/chat/completions"
api_key = os.getenv("OPENROUTER_API_KEY", "")
elif provider == "nous":
api_url = "https://inference.nousresearch.com/v1/chat/completions"
api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
else:
api_url = os.getenv(f"{provider.upper()}_API_URL", "")
api_key = os.getenv(f"{provider.upper()}_API_KEY", "")
for candidate_provider, candidate_model in candidates:
api_key = _provider_key(candidate_provider)
start = time.perf_counter()
try:
if candidate_provider in {"openrouter", "nous"}:
api_url = (
"https://openrouter.ai/api/v1/chat/completions"
if candidate_provider == "openrouter"
else "https://inference.nousresearch.com/v1/chat/completions"
)
if not api_key:
raise RuntimeError(f"No API key for provider {candidate_provider}")
payload = {
"model": candidate_model,
"messages": [{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": _data_url() if not image_url.startswith(("http://", "https://")) else image_url}},
],
}],
"max_tokens": 2000,
"temperature": 0.1,
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
analysis = data.get("choices", [{}])[0].get("message", {}).get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
elif candidate_provider == "gemini":
if not api_key:
raise RuntimeError("No API key for provider gemini")
image_bytes, mime_type = _load_image_bytes_cached()
api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{candidate_model}:generateContent?key={api_key}"
payload = {
"contents": [{"parts": [
{"text": prompt},
{"inline_data": {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}},
]}],
"generationConfig": {"temperature": 0.1, "maxOutputTokens": 2000},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload)
resp.raise_for_status()
data = resp.json()
parts = data.get("candidates", [{}])[0].get("content", {}).get("parts", [])
analysis = "\n".join(part.get("text", "") for part in parts if isinstance(part, dict) and part.get("text"))
usage = data.get("usageMetadata", {})
tokens = {
"prompt_tokens": usage.get("promptTokenCount", 0),
"completion_tokens": usage.get("candidatesTokenCount", 0),
"total_tokens": usage.get("totalTokenCount", 0),
}
elif candidate_provider == "ollama":
image_bytes, _ = _load_image_bytes_cached()
payload = {
"model": candidate_model,
"stream": False,
"messages": [{"role": "user", "content": prompt, "images": [base64.b64encode(image_bytes).decode()]}],
"options": {"temperature": 0.1},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post("http://localhost:11434/api/chat", json=payload)
resp.raise_for_status()
data = resp.json()
analysis = data.get("message", {}).get("content", "")
tokens = {
"prompt_tokens": data.get("prompt_eval_count", 0),
"completion_tokens": data.get("eval_count", 0),
"total_tokens": (data.get("prompt_eval_count", 0) or 0) + (data.get("eval_count", 0) or 0),
}
else:
raise RuntimeError(f"Unsupported provider {candidate_provider}")
if not api_key:
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": f"No API key for provider {provider}",
}
latency_ms = (time.perf_counter() - start) * 1000
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
"provider_used": candidate_provider,
"model_used": candidate_model,
}
except Exception as e:
failures.append(f"{candidate_provider}:{candidate_model} => {e}")
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": " | ".join(failures) if failures else "No runs",
"provider_used": candidates[-1][0] if candidates else provider,
"model_used": candidates[-1][1] if candidates else model_id,
}
payload = {
"model": model_id,
"messages": messages,
"max_tokens": 2000,
"temperature": 0.1,
}
start = time.perf_counter()
try:
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
latency_ms = (time.perf_counter() - start) * 1000
analysis = ""
choices = data.get("choices", [])
if choices:
msg = choices[0].get("message", {})
analysis = msg.get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
}
except Exception as e:
return {
"analysis": "",
"latency_ms": round((time.perf_counter() - start) * 1000, 1),
"tokens": {},
"success": False,
"error": str(e),
}
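Review note: distilled, the new behaviour tries each (provider, model) lane in order and only reports failure after all lanes are exhausted, preserving per-lane error context. A minimal sketch of the pattern; call_provider is a hypothetical awaitable, and the joined error string mirrors the code above.

async def try_candidates(candidates, call_provider):
    """Try (provider, model) lanes in order; first success wins."""
    failures = []
    for provider, model in candidates:
        try:
            return await call_provider(provider, model)
        except Exception as e:
            failures.append(f"{provider}:{model} => {e}")  # keep per-lane context for the report
    return {"success": False, "error": " | ".join(failures) if failures else "No runs"}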
# ---------------------------------------------------------------------------
# Evaluation metrics
@@ -398,7 +463,13 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
failed = [r[model_name] for r in results if not r[model_name]["success"]]
if not model_results:
summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
summary[model_name] = {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
}
continue
latencies = [r["avg_latency_ms"] for r in model_results]
@@ -410,6 +481,7 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
"success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
"total_runs": len(model_results),
"total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
"latency": {
"mean_ms": round(statistics.mean(latencies), 1),
"median_ms": round(statistics.median(latencies), 1),
@@ -495,6 +567,23 @@ def to_markdown(report: dict) -> str:
f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
)
lines += ["", "## Failure Modes", ""]
had_failures = False
for mkey, mname in config["models"].items():
model_summary = summary.get(mkey, {})
failure_examples = model_summary.get("failure_examples", [])
if not failure_examples and not model_summary.get("error"):
continue
had_failures = True
lines.append(f"### {mname}")
if model_summary.get("error"):
lines.append(f"- Summary: {model_summary['error']}")
for err in failure_examples:
lines.append(f"- {err}")
lines.append("")
if not had_failures:
lines.append("- No provider/runtime failures recorded.")
# Verdict
lines += ["", "## Verdict", ""]
@@ -516,8 +605,12 @@ def to_markdown(report: dict) -> str:
if best_model:
lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
lines.append("")
lines.append("Recommendation: keep the best-performing Gemma/Gemini lane from this run and only switch if repeated runs disagree.")
else:
lines.append("No clear winner — insufficient data.")
lines.append("Benchmark blocked or insufficient data for a trustworthy winner.")
lines.append("")
lines.append("Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.")
return "\n".join(lines)
@@ -528,44 +621,124 @@ def to_markdown(report: dict) -> str:
def generate_sample_dataset() -> List[dict]:
"""Generate a sample test dataset with diverse public images.
"""Generate a larger benchmark dataset aligned with issue #817.
Returns list of test image definitions.
Returns 50+ images across screenshots, diagrams, photos, OCR, charts,
and document-like images so the harness matches the issue contract.
"""
return [
# Screenshots
{
"id": "screenshot_github",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
dataset: List[dict] = []
screenshots = [
("github_mark", "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", ["github", "logo", "mark"]),
("github_social", "https://github.githubassets.com/images/modules/site/social-cards.png", ["github", "page", "web"]),
("github_code_search", "https://github.githubassets.com/images/modules/site/features-code-search.png", ["search", "code", "feature"]),
("terminal_capture", "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", ["terminal", "command", "output"]),
("http_404", "https://http.cat/404.jpg", ["404", "error", "cat"]),
("dummy_cli_01", "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01", ["hermes", "cli", "session"]),
("dummy_cli_02", "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard", ["prompt", "cache", "dashboard"]),
("dummy_ui_01", "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode", ["settings", "voice", "mode"]),
("dummy_ui_02", "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview", ["browser", "vision", "preview"]),
("dummy_ui_03", "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector", ["tool", "call", "inspector"]),
]
for ident, url, keywords in screenshots:
dataset.append({
"id": f"screenshot_{ident}",
"url": url,
"category": "screenshot",
"expected_keywords": ["github", "logo", "octocat"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
# Diagrams
{
"id": "diagram_architecture",
"url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
})
diagrams = [
("flow_a", "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker", ["flowchart", "api", "worker"]),
("flow_b", "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client", ["architecture", "diagram", "cache"]),
("uml_a", "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message", ["class", "diagram", "session"]),
("uml_b", "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response", ["sequence", "diagram", "response"]),
("network_a", "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router", ["network", "node", "router"]),
("network_b", "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth", ["service", "mesh", "auth"]),
("state_machine", "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop", ["state", "machine", "idle"]),
("mind_map", "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools", ["mind", "memory", "tools"]),
("pipeline", "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize", ["pipeline", "ingest", "summarize"]),
("org_chart", "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops", ["org", "chart", "review"]),
]
for ident, url, keywords in diagrams:
dataset.append({
"id": f"diagram_{ident}",
"url": url,
"category": "diagram",
"expected_keywords": ["architecture", "component", "service"],
"expected_structure": {"min_length": 100, "min_sentences": 3},
},
# Photos
{
"id": "photo_nature",
"url": "https://picsum.photos/seed/bench1/400/300",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": False},
})
for idx in range(1, 11):
dataset.append({
"id": f"photo_random_{idx:02d}",
"url": f"https://picsum.photos/seed/vision-bench-{idx}/640/480",
"category": "photo",
"expected_keywords": [],
"expected_structure": {"min_length": 30, "min_sentences": 1},
},
# Charts
{
"id": "chart_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "data"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
})
charts = [
("bar_quarterly", "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", ["bar", "chart", "revenue"]),
("pie_market", "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", ["pie", "chart", "percentage"]),
("line_temp", "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", ["line", "chart", "temperature"]),
("radar_skill", "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", ["radar", "chart", "skill"]),
("stacked_cloud", "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", ["stacked", "bar", "chart"]),
("area_growth", "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}", ["line", "growth", "chart"]),
("scatter_eval", "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}", ["scatter", "chart", "runs"]),
("horizontal_bar", "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}", ["bar", "score", "ocr"]),
("bubble_usage", "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}", ["bubble", "latency", "chart"]),
("doughnut_devices", "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", ["doughnut", "chart", "device"]),
]
for ident, url, keywords in charts:
dataset.append({
"id": f"chart_{ident}",
"url": url,
"category": "chart",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": True},
})
ocr_texts = [
"Hermes OCR Alpha 01",
"Prompt Cache Hit 87%",
"Session 42 Ready",
"Latency 118 ms",
"Voice Mode Enabled",
]
for idx, text in enumerate(ocr_texts, start=1):
dataset.append({
"id": f"ocr_text_{idx:02d}",
"url": f"https://dummyimage.com/1200x320/ffffff/000000.png&text={text.replace(' ', '+')}",
"category": "ocr",
"expected_keywords": text.lower().split()[:2],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 10, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
documents = [
"Invoice 1001 Total 42 Due 2026-04-22",
"Form A Name Alice Status Approved",
"Report Memory Recall Score 91 Percent",
"Checklist Crisis Escalation Call 988 Now",
"Meeting Notes Vision Benchmark Run Pending",
]
for idx, text in enumerate(documents, start=1):
dataset.append({
"id": f"document_text_{idx:02d}",
"url": f"https://dummyimage.com/1400x900/f8fafc/0f172a.png&text={text.replace(' ', '+')}",
"category": "document",
"expected_keywords": text.lower().split()[:3],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 20, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
return dataset
def load_dataset(path: str) -> List[dict]:
@@ -585,7 +758,9 @@ async def main():
parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--markdown-output", default=None, help="Optional markdown report output path")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
parser.add_argument("--limit", type=int, default=0, help="Limit to the first N images for smoke runs")
parser.add_argument("--models", nargs="+", default=None,
help="Models to test (default: all)")
parser.add_argument("--markdown", action="store_true", help="Output markdown report")
@@ -617,9 +792,14 @@ async def main():
print("ERROR: Provide --images or --url")
sys.exit(1)
if args.limit and args.limit > 0:
images = images[:args.limit]
# Run benchmark
report = await run_benchmark_suite(images, selected, args.runs)
markdown_report = to_markdown(report)
# Output
if args.output:
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
@@ -627,8 +807,14 @@ async def main():
json.dump(report, f, indent=2)
print(f"\nResults saved to {args.output}")
if args.markdown_output:
os.makedirs(os.path.dirname(args.markdown_output) or ".", exist_ok=True)
with open(args.markdown_output, "w", encoding="utf-8") as f:
f.write(markdown_report)
print(f"Markdown report saved to {args.markdown_output}")
if args.markdown or not args.output:
print("\n" + to_markdown(report))
print("\n" + markdown_report)
if __name__ == "__main__":
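Review note: the new --limit and --markdown-output flags compose into a quick smoke run. The script name and file paths below are assumptions; the flags are from the argparse additions in this diff.

python benchmark_vision.py --images data/vision_benchmark_images.json \
    --limit 2 --runs 1 \
    --output reports/vision_results.json \
    --markdown-output reports/vision_report.md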

View File

@@ -0,0 +1,67 @@
{
"generated_at": "2026-04-22T16:21:56.271426+00:00",
"config": {
"total_images": 2,
"runs_per_model": 1,
"models": {
"gemma4": "Gemma 4 27B",
"gemini3_flash": "Gemini 3 Flash Preview"
}
},
"results": [
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_mark",
"category": "screenshot"
},
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_social",
"category": "screenshot"
}
],
"summary": {
"gemma4": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500"
]
},
"gemini3_flash": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found"
]
}
}
}
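Review note: a sketch of how CI could consume this summary to gate on blocked runs, using only fields present in the JSON above (success_rate, error, failure_examples). The input path is an assumption.

import json
import sys

with open("vision_results.json", encoding="utf-8") as f:  # path is hypothetical
    report = json.load(f)

blocked = False
for model, stats in report["summary"].items():
    if stats.get("success_rate", 0) == 0:
        blocked = True
        print(f"{model}: {stats.get('error', 'no successful runs')}")
        for example in stats.get("failure_examples", []):
            print("  -", example.splitlines()[0])  # first line only; errors embed newlines
if blocked:
    sys.exit(1)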

View File

@@ -0,0 +1,44 @@
# Vision Benchmark Report
Generated: 2026-04-22T16:21
Images tested: 2
Runs per model: 1
Models: Gemma 4 27B, Gemini 3 Flash Preview
## Latency Comparison
| Model | Mean (ms) | Median | P95 | Std Dev |
|-------|-----------|--------|-----|---------|
## Accuracy Comparison
| Model | OCR Accuracy | Keyword Coverage | Success Rate |
|-------|-------------|-----------------|--------------|
## Token Usage
| Model | Mean Tokens/Image | Total Tokens |
|-------|------------------|--------------|
## Failure Modes
### Gemma 4 27B
- Summary: All runs failed
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500
### Gemini 3 Flash Preview
- Summary: All runs failed
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found
## Verdict
Benchmark blocked: every run failed for both models, so there is insufficient data to declare a trustworthy winner.
Recommendation: repair provider/runtime availability (a preflight sketch follows in the appendix), rerun the benchmark, and keep the current implementation unchanged until comparative results exist.
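## Appendix: Provider Preflight (Sketch)
A minimal preflight sketch for the rerun, assuming the same provider chains listed above. The environment-variable names are assumptions for illustration (match them to the real provider config); the Ollama URL is the local default that appears in the failure logs.
```python
import os
import urllib.request


def preflight() -> list[str]:
    """Return a list of obviously missing prerequisites; empty means rerun-ready."""
    problems: list[str] = []
    # Assumed key names for the hosted providers exercised by this benchmark.
    for var in ("OPENROUTER_API_KEY", "GEMINI_API_KEY", "NOUS_API_KEY"):
        if not os.environ.get(var):
            problems.append(f"missing {var}")
    # Local Ollama default endpoint; /api/tags is a cheap liveness check.
    try:
        urllib.request.urlopen("http://localhost:11434/api/tags", timeout=5)
    except OSError as exc:
        problems.append(f"ollama unreachable: {exc}")
    return problems
```
If `preflight()` returns an empty list, rerun the benchmark; otherwise fix the listed items first.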

View File

@@ -1,213 +0,0 @@
"""Regression tests: normalize_anthropic_response_v2 vs v1.
Constructs mock Anthropic responses and asserts that the v2 function
(returning NormalizedResponse) produces identical field values to the
original v1 function (returning SimpleNamespace + finish_reason).
"""
from types import SimpleNamespace
import pytest
from agent.anthropic_adapter import (
normalize_anthropic_response,
normalize_anthropic_response_v2,
)
from agent.transports.types import NormalizedResponse
def _text_block(text: str):
return SimpleNamespace(type="text", text=text)
def _thinking_block(thinking: str, signature: str = "sig_abc"):
return SimpleNamespace(type="thinking", thinking=thinking, signature=signature)
def _tool_use_block(id: str, name: str, input: dict):
return SimpleNamespace(type="tool_use", id=id, name=name, input=input)
def _response(content_blocks, stop_reason="end_turn"):
return SimpleNamespace(
content=content_blocks,
stop_reason=stop_reason,
usage=SimpleNamespace(input_tokens=10, output_tokens=5),
)
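# A minimal parity sketch, assuming only the helpers above: every test class in
# this module follows this exact pattern -- normalize one mock response with v1
# and with v2, then compare fields. Illustrative only; not collected by pytest.
def _parity_sketch():
    resp = _response([_text_block("hi")])
    v1_msg, v1_finish = normalize_anthropic_response(resp)
    v2 = normalize_anthropic_response_v2(resp)
    assert (v2.content, v2.finish_reason) == (v1_msg.content, v1_finish)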
class TestTextOnly:
def setup_method(self):
self.resp = _response([_text_block("Hello world")])
self.v1_msg, self.v1_finish = normalize_anthropic_response(self.resp)
self.v2 = normalize_anthropic_response_v2(self.resp)
def test_type(self):
assert isinstance(self.v2, NormalizedResponse)
def test_content_matches(self):
assert self.v2.content == self.v1_msg.content
def test_finish_reason_matches(self):
assert self.v2.finish_reason == self.v1_finish
def test_no_tool_calls(self):
assert self.v2.tool_calls is None
assert self.v1_msg.tool_calls is None
def test_no_reasoning(self):
assert self.v2.reasoning is None
assert self.v1_msg.reasoning is None
class TestWithToolCalls:
def setup_method(self):
self.resp = _response(
[
_text_block("I'll check that"),
_tool_use_block("toolu_abc", "terminal", {"command": "ls"}),
_tool_use_block("toolu_def", "read_file", {"path": "/tmp"}),
],
stop_reason="tool_use",
)
self.v1_msg, self.v1_finish = normalize_anthropic_response(self.resp)
self.v2 = normalize_anthropic_response_v2(self.resp)
def test_finish_reason(self):
assert self.v2.finish_reason == "tool_calls"
assert self.v1_finish == "tool_calls"
def test_tool_call_count(self):
assert len(self.v2.tool_calls) == 2
assert len(self.v1_msg.tool_calls) == 2
def test_tool_call_ids_match(self):
for i in range(2):
assert self.v2.tool_calls[i].id == self.v1_msg.tool_calls[i].id
def test_tool_call_names_match(self):
assert self.v2.tool_calls[0].name == "terminal"
assert self.v2.tool_calls[1].name == "read_file"
for i in range(2):
assert self.v2.tool_calls[i].name == self.v1_msg.tool_calls[i].function.name
def test_tool_call_arguments_match(self):
for i in range(2):
assert self.v2.tool_calls[i].arguments == self.v1_msg.tool_calls[i].function.arguments
def test_content_preserved(self):
assert self.v2.content == self.v1_msg.content
assert "check that" in self.v2.content
class TestWithThinking:
def setup_method(self):
self.resp = _response([
_thinking_block("Let me think about this carefully..."),
_text_block("The answer is 42."),
])
self.v1_msg, self.v1_finish = normalize_anthropic_response(self.resp)
self.v2 = normalize_anthropic_response_v2(self.resp)
def test_reasoning_matches(self):
assert self.v2.reasoning == self.v1_msg.reasoning
assert "think about this" in self.v2.reasoning
def test_reasoning_details_in_provider_data(self):
v1_details = self.v1_msg.reasoning_details
v2_details = self.v2.provider_data.get("reasoning_details") if self.v2.provider_data else None
assert v1_details is not None
assert v2_details is not None
assert len(v2_details) == len(v1_details)
def test_content_excludes_thinking(self):
assert self.v2.content == "The answer is 42."
class TestMixed:
def setup_method(self):
self.resp = _response(
[
_thinking_block("Planning my approach..."),
_text_block("I'll run the command"),
_tool_use_block("toolu_xyz", "terminal", {"command": "pwd"}),
],
stop_reason="tool_use",
)
self.v1_msg, self.v1_finish = normalize_anthropic_response(self.resp)
self.v2 = normalize_anthropic_response_v2(self.resp)
def test_all_fields_present(self):
assert self.v2.content is not None
assert self.v2.tool_calls is not None
assert self.v2.reasoning is not None
assert self.v2.finish_reason == "tool_calls"
def test_content_matches(self):
assert self.v2.content == self.v1_msg.content
def test_reasoning_matches(self):
assert self.v2.reasoning == self.v1_msg.reasoning
def test_tool_call_matches(self):
assert self.v2.tool_calls[0].id == self.v1_msg.tool_calls[0].id
assert self.v2.tool_calls[0].name == self.v1_msg.tool_calls[0].function.name
class TestStopReasons:
@pytest.mark.parametrize("stop_reason,expected", [
("end_turn", "stop"),
("tool_use", "tool_calls"),
("max_tokens", "length"),
("stop_sequence", "stop"),
("refusal", "content_filter"),
("model_context_window_exceeded", "length"),
("unknown_future_reason", "stop"),
])
def test_stop_reason_mapping(self, stop_reason, expected):
resp = _response([_text_block("x")], stop_reason=stop_reason)
_v1_msg, v1_finish = normalize_anthropic_response(resp)
v2 = normalize_anthropic_response_v2(resp)
assert v2.finish_reason == v1_finish == expected
class TestStripToolPrefix:
def test_prefix_stripped(self):
resp = _response(
[_tool_use_block("toolu_1", "mcp_terminal", {"cmd": "ls"})],
stop_reason="tool_use",
)
v1_msg, _ = normalize_anthropic_response(resp, strip_tool_prefix=True)
v2 = normalize_anthropic_response_v2(resp, strip_tool_prefix=True)
assert v1_msg.tool_calls[0].function.name == "terminal"
assert v2.tool_calls[0].name == "terminal"
def test_prefix_kept(self):
resp = _response(
[_tool_use_block("toolu_1", "mcp_terminal", {"cmd": "ls"})],
stop_reason="tool_use",
)
v1_msg, _ = normalize_anthropic_response(resp, strip_tool_prefix=False)
v2 = normalize_anthropic_response_v2(resp, strip_tool_prefix=False)
assert v1_msg.tool_calls[0].function.name == "mcp_terminal"
assert v2.tool_calls[0].name == "mcp_terminal"
class TestEdgeCases:
def test_empty_content_blocks(self):
resp = _response([])
v1_msg, _v1_finish = normalize_anthropic_response(resp)
v2 = normalize_anthropic_response_v2(resp)
assert v2.content == v1_msg.content
assert v2.content is None
def test_no_reasoning_details_means_none_provider_data(self):
resp = _response([_text_block("hi")])
v2 = normalize_anthropic_response_v2(resp)
assert v2.provider_data is None
def test_v2_returns_dataclass_not_namespace(self):
resp = _response([_text_block("hi")])
v2 = normalize_anthropic_response_v2(resp)
assert isinstance(v2, NormalizedResponse)
assert not isinstance(v2, SimpleNamespace)

View File

@@ -1,208 +0,0 @@
"""Tests for the transport ABC, registry, and AnthropicTransport."""
from types import SimpleNamespace
import pytest
from agent.transports import _REGISTRY, get_transport, register_transport
from agent.transports.base import ProviderTransport
from agent.transports.types import NormalizedResponse
class TestProviderTransportABC:
def test_cannot_instantiate_abc(self):
with pytest.raises(TypeError):
ProviderTransport()
def test_concrete_must_implement_all_abstract(self):
class Incomplete(ProviderTransport):
@property
def api_mode(self):
return "test"
with pytest.raises(TypeError):
Incomplete()
def test_minimal_concrete(self):
class Minimal(ProviderTransport):
@property
def api_mode(self):
return "test_minimal"
def convert_messages(self, messages, **kw):
return messages
def convert_tools(self, tools):
return tools
def build_kwargs(self, model, messages, tools=None, **params):
return {"model": model, "messages": messages}
def normalize_response(self, response, **kw):
return NormalizedResponse(content="ok", tool_calls=None, finish_reason="stop")
t = Minimal()
assert t.api_mode == "test_minimal"
assert t.validate_response(None) is True
assert t.extract_cache_stats(None) is None
assert t.map_finish_reason("end_turn") == "end_turn"
class TestTransportRegistry:
def test_get_unregistered_returns_none(self):
assert get_transport("nonexistent_mode") is None
def test_anthropic_registered_on_import(self):
import agent.transports.anthropic # noqa: F401
t = get_transport("anthropic_messages")
assert t is not None
assert t.api_mode == "anthropic_messages"
def test_register_and_get(self):
class DummyTransport(ProviderTransport):
@property
def api_mode(self):
return "dummy_test"
def convert_messages(self, messages, **kw):
return messages
def convert_tools(self, tools):
return tools
def build_kwargs(self, model, messages, tools=None, **params):
return {}
def normalize_response(self, response, **kw):
return NormalizedResponse(content=None, tool_calls=None, finish_reason="stop")
register_transport("dummy_test", DummyTransport)
t = get_transport("dummy_test")
assert t.api_mode == "dummy_test"
_REGISTRY.pop("dummy_test", None)
class TestAnthropicTransport:
@pytest.fixture
def transport(self):
import agent.transports.anthropic # noqa: F401
return get_transport("anthropic_messages")
def test_api_mode(self, transport):
assert transport.api_mode == "anthropic_messages"
def test_convert_tools_simple(self, transport):
tools = [{
"type": "function",
"function": {
"name": "test_tool",
"description": "A test",
"parameters": {"type": "object", "properties": {}},
},
}]
result = transport.convert_tools(tools)
assert len(result) == 1
assert result[0]["name"] == "test_tool"
assert "input_schema" in result[0]
def test_validate_response_none(self, transport):
assert transport.validate_response(None) is False
def test_validate_response_empty_content(self, transport):
r = SimpleNamespace(content=[])
assert transport.validate_response(r) is False
def test_validate_response_valid(self, transport):
r = SimpleNamespace(content=[SimpleNamespace(type="text", text="hello")])
assert transport.validate_response(r) is True
def test_map_finish_reason(self, transport):
assert transport.map_finish_reason("end_turn") == "stop"
assert transport.map_finish_reason("tool_use") == "tool_calls"
assert transport.map_finish_reason("max_tokens") == "length"
assert transport.map_finish_reason("stop_sequence") == "stop"
assert transport.map_finish_reason("refusal") == "content_filter"
assert transport.map_finish_reason("model_context_window_exceeded") == "length"
assert transport.map_finish_reason("unknown") == "stop"
def test_extract_cache_stats_none_usage(self, transport):
r = SimpleNamespace(usage=None)
assert transport.extract_cache_stats(r) is None
def test_extract_cache_stats_with_cache(self, transport):
usage = SimpleNamespace(cache_read_input_tokens=100, cache_creation_input_tokens=50)
r = SimpleNamespace(usage=usage)
result = transport.extract_cache_stats(r)
assert result == {"cached_tokens": 100, "creation_tokens": 50}
def test_extract_cache_stats_zero(self, transport):
usage = SimpleNamespace(cache_read_input_tokens=0, cache_creation_input_tokens=0)
r = SimpleNamespace(usage=usage)
assert transport.extract_cache_stats(r) is None
def test_normalize_response_text(self, transport):
r = SimpleNamespace(
content=[SimpleNamespace(type="text", text="Hello world")],
stop_reason="end_turn",
usage=SimpleNamespace(input_tokens=10, output_tokens=5),
model="claude-sonnet-4-6",
)
nr = transport.normalize_response(r)
assert isinstance(nr, NormalizedResponse)
assert nr.content == "Hello world"
assert nr.tool_calls is None or nr.tool_calls == []
assert nr.finish_reason == "stop"
def test_normalize_response_tool_calls(self, transport):
r = SimpleNamespace(
content=[
SimpleNamespace(type="tool_use", id="toolu_123", name="terminal", input={"command": "ls"}),
],
stop_reason="tool_use",
usage=SimpleNamespace(input_tokens=10, output_tokens=20),
model="claude-sonnet-4-6",
)
nr = transport.normalize_response(r)
assert nr.finish_reason == "tool_calls"
assert len(nr.tool_calls) == 1
tc = nr.tool_calls[0]
assert tc.name == "terminal"
assert tc.id == "toolu_123"
assert '"command"' in tc.arguments
def test_normalize_response_thinking(self, transport):
r = SimpleNamespace(
content=[
SimpleNamespace(type="thinking", thinking="Let me think..."),
SimpleNamespace(type="text", text="The answer is 42"),
],
stop_reason="end_turn",
usage=SimpleNamespace(input_tokens=10, output_tokens=15),
model="claude-sonnet-4-6",
)
nr = transport.normalize_response(r)
assert nr.content == "The answer is 42"
assert nr.reasoning == "Let me think..."
def test_build_kwargs_returns_dict(self, transport):
messages = [{"role": "user", "content": "Hello"}]
kw = transport.build_kwargs(
model="claude-sonnet-4-6",
messages=messages,
max_tokens=1024,
)
assert isinstance(kw, dict)
assert "model" in kw
assert "max_tokens" in kw
assert "messages" in kw
def test_convert_messages_extracts_system(self, transport):
messages = [
{"role": "system", "content": "You are helpful."},
{"role": "user", "content": "Hi"},
]
system, msgs = transport.convert_messages(messages)
assert system is not None
assert len(msgs) >= 1

View File

@@ -1,130 +0,0 @@
"""Tests for agent/transports/types.py — dataclass construction + helpers."""
import json
from agent.transports.types import (
NormalizedResponse,
ToolCall,
Usage,
build_tool_call,
map_finish_reason,
)
class TestToolCall:
def test_basic_construction(self):
tc = ToolCall(id="call_abc", name="terminal", arguments='{"cmd": "ls"}')
assert tc.id == "call_abc"
assert tc.name == "terminal"
assert tc.arguments == '{"cmd": "ls"}'
assert tc.provider_data is None
def test_none_id(self):
tc = ToolCall(id=None, name="read_file", arguments="{}")
assert tc.id is None
def test_provider_data(self):
tc = ToolCall(
id="call_x",
name="t",
arguments="{}",
provider_data={"call_id": "call_x", "response_item_id": "fc_x"},
)
assert tc.provider_data["call_id"] == "call_x"
assert tc.provider_data["response_item_id"] == "fc_x"
class TestUsage:
def test_defaults(self):
u = Usage()
assert u.prompt_tokens == 0
assert u.completion_tokens == 0
assert u.total_tokens == 0
assert u.cached_tokens == 0
def test_explicit(self):
u = Usage(prompt_tokens=100, completion_tokens=50, total_tokens=150, cached_tokens=80)
assert u.total_tokens == 150
class TestNormalizedResponse:
def test_text_only(self):
r = NormalizedResponse(content="hello", tool_calls=None, finish_reason="stop")
assert r.content == "hello"
assert r.tool_calls is None
assert r.finish_reason == "stop"
assert r.reasoning is None
assert r.usage is None
assert r.provider_data is None
def test_with_tool_calls(self):
tcs = [ToolCall(id="call_1", name="terminal", arguments='{"cmd":"pwd"}')]
r = NormalizedResponse(content=None, tool_calls=tcs, finish_reason="tool_calls")
assert r.finish_reason == "tool_calls"
assert len(r.tool_calls) == 1
assert r.tool_calls[0].name == "terminal"
def test_with_reasoning(self):
r = NormalizedResponse(
content="answer",
tool_calls=None,
finish_reason="stop",
reasoning="I thought about it",
)
assert r.reasoning == "I thought about it"
def test_with_provider_data(self):
r = NormalizedResponse(
content=None,
tool_calls=None,
finish_reason="stop",
provider_data={"reasoning_details": [{"type": "thinking", "thinking": "hmm"}]},
)
assert r.provider_data["reasoning_details"][0]["type"] == "thinking"
class TestBuildToolCall:
def test_dict_arguments_serialized(self):
tc = build_tool_call(id="call_1", name="terminal", arguments={"cmd": "ls"})
assert tc.arguments == json.dumps({"cmd": "ls"})
assert tc.provider_data is None
def test_string_arguments_passthrough(self):
tc = build_tool_call(id="call_2", name="read_file", arguments='{"path": "/tmp"}')
assert tc.arguments == '{"path": "/tmp"}'
def test_provider_fields(self):
tc = build_tool_call(
id="call_3",
name="terminal",
arguments="{}",
call_id="call_3",
response_item_id="fc_3",
)
assert tc.provider_data == {"call_id": "call_3", "response_item_id": "fc_3"}
def test_none_id(self):
tc = build_tool_call(id=None, name="t", arguments="{}")
assert tc.id is None
class TestMapFinishReason:
ANTHROPIC_MAP = {
"end_turn": "stop",
"tool_use": "tool_calls",
"max_tokens": "length",
"stop_sequence": "stop",
"refusal": "content_filter",
}
def test_known_reason(self):
assert map_finish_reason("end_turn", self.ANTHROPIC_MAP) == "stop"
assert map_finish_reason("tool_use", self.ANTHROPIC_MAP) == "tool_calls"
assert map_finish_reason("max_tokens", self.ANTHROPIC_MAP) == "length"
assert map_finish_reason("refusal", self.ANTHROPIC_MAP) == "content_filter"
def test_unknown_reason_defaults_to_stop(self):
assert map_finish_reason("something_new", self.ANTHROPIC_MAP) == "stop"
def test_none_reason(self):
assert map_finish_reason(None, self.ANTHROPIC_MAP) == "stop"

View File

@@ -199,7 +199,7 @@ class TestMarkdown:
class TestDataset:
def test_sample_dataset_has_entries(self):
dataset = generate_sample_dataset()
assert len(dataset) >= 4
assert len(dataset) >= 50
def test_sample_dataset_structure(self):
dataset = generate_sample_dataset()
@@ -216,6 +216,9 @@ class TestDataset:
assert "screenshot" in categories
assert "diagram" in categories
assert "photo" in categories
assert "chart" in categories
assert "ocr" in categories
assert "document" in categories
class TestModels:

View File

@@ -0,0 +1,21 @@
import json
from pathlib import Path
DATASET = Path("benchmarks/test_images.json")
REPORT = Path("metrics/vision-benchmark-smoke-2026-04-22.md")
def test_benchmark_dataset_is_issue_sized_and_category_complete() -> None:
items = json.loads(DATASET.read_text(encoding="utf-8"))
assert len(items) >= 50
categories = {item["category"] for item in items}
assert {"screenshot", "diagram", "photo", "ocr", "chart", "document"}.issubset(categories)
def test_metrics_report_exists_with_recommendation() -> None:
assert REPORT.exists(), "missing benchmark report under metrics/"
text = REPORT.read_text(encoding="utf-8")
assert "Recommendation" in text
assert "Gemma 4" in text
assert "Gemini" in text