feat: harden vision benchmark artifacts
All checks were successful
Lint / lint (pull_request) Successful in 9s

Refs #817
This commit is contained in:
Alexander Whitestone
2026-04-22 12:20:49 -04:00
parent 23e093fc75
commit 9d05f77a9b
4 changed files with 1112 additions and 252 deletions

View File

@@ -1,194 +1,757 @@
[ [
{ {
"id": "screenshot_github_home", "id": "screenshot_github_mark",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot", "category": "screenshot",
"expected_keywords": ["github", "logo", "mark"], "expected_keywords": [
"github",
"logo",
"mark"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} "expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
}, },
{ {
"id": "diagram_mermaid_flow", "id": "screenshot_github_social",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w", "url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": [
"github",
"page",
"web"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_github_code_search",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": [
"search",
"code",
"feature"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_terminal_capture",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": [
"terminal",
"command",
"output"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_http_404",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": [
"404",
"error",
"cat"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_01",
"url": "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01",
"category": "screenshot",
"expected_keywords": [
"hermes",
"cli",
"session"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_02",
"url": "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard",
"category": "screenshot",
"expected_keywords": [
"prompt",
"cache",
"dashboard"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_01",
"url": "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode",
"category": "screenshot",
"expected_keywords": [
"settings",
"voice",
"mode"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_02",
"url": "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview",
"category": "screenshot",
"expected_keywords": [
"browser",
"vision",
"preview"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_03",
"url": "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector",
"category": "screenshot",
"expected_keywords": [
"tool",
"call",
"inspector"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_flow_a",
"url": "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker",
"category": "diagram", "category": "diagram",
"expected_keywords": ["flow", "diagram", "process"], "expected_keywords": [
"flowchart",
"api",
"worker"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
}, },
{ {
"id": "photo_random_1", "id": "diagram_flow_b",
"url": "https://picsum.photos/seed/vision1/400/300", "url": "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client",
"category": "photo", "category": "diagram",
"expected_keywords": [], "expected_keywords": [
"architecture",
"diagram",
"cache"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
}, },
{ {
"id": "photo_random_2", "id": "diagram_uml_a",
"url": "https://picsum.photos/seed/vision2/400/300", "url": "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message",
"category": "photo", "category": "diagram",
"expected_keywords": [], "expected_keywords": [
"class",
"diagram",
"session"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
}, },
{ {
"id": "chart_simple_bar", "id": "diagram_uml_b",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", "url": "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response",
"category": "chart", "category": "diagram",
"expected_keywords": ["bar", "chart", "revenue"], "expected_keywords": [
"sequence",
"diagram",
"response"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
}, },
{ {
"id": "chart_pie", "id": "diagram_network_a",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", "url": "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router",
"category": "chart", "category": "diagram",
"expected_keywords": ["pie", "chart", "percentage"], "expected_keywords": [
"network",
"node",
"router"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_network_b",
"url": "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth",
"category": "diagram",
"expected_keywords": [
"service",
"mesh",
"auth"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_state_machine",
"url": "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop",
"category": "diagram",
"expected_keywords": [
"state",
"machine",
"idle"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_mind_map",
"url": "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools",
"category": "diagram",
"expected_keywords": [
"mind",
"memory",
"tools"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_pipeline",
"url": "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize",
"category": "diagram",
"expected_keywords": [
"pipeline",
"ingest",
"summarize"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
}, },
{ {
"id": "diagram_org_chart", "id": "diagram_org_chart",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg", "url": "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops",
"category": "diagram", "category": "diagram",
"expected_keywords": ["organization", "hierarchy", "chart"], "expected_keywords": [
"org",
"chart",
"review"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
}, },
{ {
"id": "screenshot_terminal", "id": "photo_random_01",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", "url": "https://picsum.photos/seed/vision-bench-1/640/480",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"category": "photo", "category": "photo",
"expected_keywords": [], "expected_keywords": [],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} "expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
}, },
{ {
"id": "chart_line", "id": "photo_random_02",
"url": "https://picsum.photos/seed/vision-bench-2/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_03",
"url": "https://picsum.photos/seed/vision-bench-3/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_04",
"url": "https://picsum.photos/seed/vision-bench-4/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_05",
"url": "https://picsum.photos/seed/vision-bench-5/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_06",
"url": "https://picsum.photos/seed/vision-bench-6/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_07",
"url": "https://picsum.photos/seed/vision-bench-7/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_08",
"url": "https://picsum.photos/seed/vision-bench-8/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_09",
"url": "https://picsum.photos/seed/vision-bench-9/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_10",
"url": "https://picsum.photos/seed/vision-bench-10/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_bar_quarterly",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": [
"bar",
"chart",
"revenue"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_pie_market",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": [
"pie",
"chart",
"percentage"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_line_temp",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"category": "chart", "category": "chart",
"expected_keywords": ["line", "chart", "temperature"], "expected_keywords": [
"line",
"chart",
"temperature"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
}, },
{ {
"id": "diagram_sequence", "id": "chart_radar_skill",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_radar",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", "url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"category": "chart", "category": "chart",
"expected_keywords": ["radar", "chart", "skill"], "expected_keywords": [
"radar",
"chart",
"skill"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
}, },
{ {
"id": "photo_random_5", "id": "chart_stacked_cloud",
"url": "https://picsum.photos/seed/vision5/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": ["doughnut", "chart", "device"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
},
{
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_stacked_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"category": "chart", "category": "chart",
"expected_keywords": ["stacked", "bar", "chart"], "expected_keywords": [
"stacked",
"bar",
"chart"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
}, },
{ {
"id": "screenshot_dashboard", "id": "chart_area_growth",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png", "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}",
"category": "screenshot", "category": "chart",
"expected_keywords": ["search", "code", "feature"], "expected_keywords": [
"line",
"growth",
"chart"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
}, },
{ {
"id": "photo_random_8", "id": "chart_scatter_eval",
"url": "https://picsum.photos/seed/vision8/400/300", "url": "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}",
"category": "photo", "category": "chart",
"expected_keywords": [], "expected_keywords": [
"scatter",
"chart",
"runs"
],
"ground_truth_ocr": "", "ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} "expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_horizontal_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}",
"category": "chart",
"expected_keywords": [
"bar",
"score",
"ocr"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_bubble_usage",
"url": "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}",
"category": "chart",
"expected_keywords": [
"bubble",
"latency",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_doughnut_devices",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": [
"doughnut",
"chart",
"device"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "ocr_text_01",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Hermes+OCR+Alpha+01",
"category": "ocr",
"expected_keywords": [
"hermes",
"ocr"
],
"ground_truth_ocr": "Hermes OCR Alpha 01",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_02",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Prompt+Cache+Hit+87%",
"category": "ocr",
"expected_keywords": [
"prompt",
"cache"
],
"ground_truth_ocr": "Prompt Cache Hit 87%",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_03",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Session+42+Ready",
"category": "ocr",
"expected_keywords": [
"session",
"42"
],
"ground_truth_ocr": "Session 42 Ready",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_04",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Latency+118+ms",
"category": "ocr",
"expected_keywords": [
"latency",
"118"
],
"ground_truth_ocr": "Latency 118 ms",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_05",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Voice+Mode+Enabled",
"category": "ocr",
"expected_keywords": [
"voice",
"mode"
],
"ground_truth_ocr": "Voice Mode Enabled",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "document_text_01",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Invoice+1001+Total+42+Due+2026-04-22",
"category": "document",
"expected_keywords": [
"invoice",
"1001",
"total"
],
"ground_truth_ocr": "Invoice 1001 Total 42 Due 2026-04-22",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_02",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Form+A+Name+Alice+Status+Approved",
"category": "document",
"expected_keywords": [
"form",
"a",
"name"
],
"ground_truth_ocr": "Form A Name Alice Status Approved",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "document_text_03",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Report+Memory+Recall+Score+91+Percent",
"category": "document",
"expected_keywords": [
"report",
"memory",
"recall"
],
"ground_truth_ocr": "Report Memory Recall Score 91 Percent",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_04",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Checklist+Crisis+Escalation+Call+988+Now",
"category": "document",
"expected_keywords": [
"checklist",
"crisis",
"escalation"
],
"ground_truth_ocr": "Checklist Crisis Escalation Call 988 Now",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_05",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Meeting+Notes+Vision+Benchmark+Run+Pending",
"category": "document",
"expected_keywords": [
"meeting",
"notes",
"vision"
],
"ground_truth_ocr": "Meeting Notes Vision Benchmark Run Pending",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
} }
] ]

View File

@@ -22,10 +22,12 @@ import argparse
import asyncio import asyncio
import base64 import base64
import json import json
import mimetypes
import os import os
import statistics import statistics
import sys import sys
import time import time
import urllib.request
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@@ -41,12 +43,16 @@ MODELS = {
"model_id": "google/gemma-4-27b-it", "model_id": "google/gemma-4-27b-it",
"display_name": "Gemma 4 27B", "display_name": "Gemma 4 27B",
"provider": "nous", "provider": "nous",
"fallback_provider": "ollama",
"fallback_model_id": "gemma4:latest",
"description": "Google's multimodal Gemma 4 model", "description": "Google's multimodal Gemma 4 model",
}, },
"gemini3_flash": { "gemini3_flash": {
"model_id": "google/gemini-3-flash-preview", "model_id": "google/gemini-3-flash-preview",
"display_name": "Gemini 3 Flash Preview", "display_name": "Gemini 3 Flash Preview",
"provider": "openrouter", "provider": "openrouter",
"fallback_provider": "gemini",
"fallback_model_id": "gemini-2.5-flash",
"description": "Current default vision model", "description": "Current default vision model",
}, },
} }
@@ -84,91 +90,150 @@ async def analyze_with_model(
""" """
import httpx import httpx
def _load_image_bytes_cached() -> tuple[bytes, str]:
nonlocal _image_bytes, _mime_type
if _image_bytes is not None:
return _image_bytes, _mime_type
if image_url.startswith(("http://", "https://")):
with urllib.request.urlopen(image_url, timeout=30) as resp:
_image_bytes = resp.read()
_mime_type = resp.headers.get_content_type() or mimetypes.guess_type(image_url)[0] or "image/png"
else:
path = Path(image_url).expanduser()
_image_bytes = path.read_bytes()
_mime_type = mimetypes.guess_type(str(path))[0] or "image/png"
return _image_bytes, _mime_type
def _data_url() -> str:
image_bytes, mime_type = _load_image_bytes_cached()
return f"data:{mime_type};base64,{base64.b64encode(image_bytes).decode()}"
def _provider_key(provider: str) -> str:
if provider == "openrouter":
return os.getenv("OPENROUTER_API_KEY", "")
if provider == "nous":
return os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
if provider == "gemini":
return os.getenv("GEMINI_API_KEY", "") or os.getenv("GOOGLE_API_KEY", "")
return os.getenv(f"{provider.upper()}_API_KEY", "")
provider = model_config["provider"] provider = model_config["provider"]
model_id = model_config["model_id"] model_id = model_config["model_id"]
candidates = [(provider, model_id)]
if model_config.get("fallback_provider") and model_config.get("fallback_model_id"):
candidates.append((model_config["fallback_provider"], model_config["fallback_model_id"]))
# Prepare messages _image_bytes: Optional[bytes] = None
messages = [ _mime_type = "image/png"
{ failures = []
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
# Route to provider for candidate_provider, candidate_model in candidates:
if provider == "openrouter": api_key = _provider_key(candidate_provider)
api_url = "https://openrouter.ai/api/v1/chat/completions" start = time.perf_counter()
api_key = os.getenv("OPENROUTER_API_KEY", "") try:
elif provider == "nous": if candidate_provider in {"openrouter", "nous"}:
api_url = "https://inference.nousresearch.com/v1/chat/completions" api_url = (
api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "") "https://openrouter.ai/api/v1/chat/completions"
else: if candidate_provider == "openrouter"
api_url = os.getenv(f"{provider.upper()}_API_URL", "") else "https://inference.nousresearch.com/v1/chat/completions"
api_key = os.getenv(f"{provider.upper()}_API_KEY", "") )
if not api_key:
raise RuntimeError(f"No API key for provider {candidate_provider}")
payload = {
"model": candidate_model,
"messages": [{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": _data_url() if not image_url.startswith(("http://", "https://")) else image_url}},
],
}],
"max_tokens": 2000,
"temperature": 0.1,
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
analysis = data.get("choices", [{}])[0].get("message", {}).get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
elif candidate_provider == "gemini":
if not api_key:
raise RuntimeError("No API key for provider gemini")
image_bytes, mime_type = _load_image_bytes_cached()
api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{candidate_model}:generateContent?key={api_key}"
payload = {
"contents": [{"parts": [
{"text": prompt},
{"inline_data": {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}},
]}],
"generationConfig": {"temperature": 0.1, "maxOutputTokens": 2000},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload)
resp.raise_for_status()
data = resp.json()
parts = data.get("candidates", [{}])[0].get("content", {}).get("parts", [])
analysis = "\n".join(part.get("text", "") for part in parts if isinstance(part, dict) and part.get("text"))
usage = data.get("usageMetadata", {})
tokens = {
"prompt_tokens": usage.get("promptTokenCount", 0),
"completion_tokens": usage.get("candidatesTokenCount", 0),
"total_tokens": usage.get("totalTokenCount", 0),
}
elif candidate_provider == "ollama":
image_bytes, _ = _load_image_bytes_cached()
payload = {
"model": candidate_model,
"stream": False,
"messages": [{"role": "user", "content": prompt, "images": [base64.b64encode(image_bytes).decode()]}],
"options": {"temperature": 0.1},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post("http://localhost:11434/api/chat", json=payload)
resp.raise_for_status()
data = resp.json()
analysis = data.get("message", {}).get("content", "")
tokens = {
"prompt_tokens": data.get("prompt_eval_count", 0),
"completion_tokens": data.get("eval_count", 0),
"total_tokens": (data.get("prompt_eval_count", 0) or 0) + (data.get("eval_count", 0) or 0),
}
else:
raise RuntimeError(f"Unsupported provider {candidate_provider}")
if not api_key: latency_ms = (time.perf_counter() - start) * 1000
return { return {
"analysis": "", "analysis": analysis,
"latency_ms": 0, "latency_ms": round(latency_ms, 1),
"tokens": {}, "tokens": tokens,
"success": False, "success": True,
"error": f"No API key for provider {provider}", "error": "",
} "provider_used": candidate_provider,
"model_used": candidate_model,
}
except Exception as e:
failures.append(f"{candidate_provider}:{candidate_model} => {e}")
headers = { return {
"Authorization": f"Bearer {api_key}", "analysis": "",
"Content-Type": "application/json", "latency_ms": 0,
"tokens": {},
"success": False,
"error": " | ".join(failures) if failures else "No runs",
"provider_used": candidates[-1][0] if candidates else provider,
"model_used": candidates[-1][1] if candidates else model_id,
} }
payload = {
"model": model_id,
"messages": messages,
"max_tokens": 2000,
"temperature": 0.1,
}
start = time.perf_counter()
try:
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
latency_ms = (time.perf_counter() - start) * 1000
analysis = ""
choices = data.get("choices", [])
if choices:
msg = choices[0].get("message", {})
analysis = msg.get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
}
except Exception as e:
return {
"analysis": "",
"latency_ms": round((time.perf_counter() - start) * 1000, 1),
"tokens": {},
"success": False,
"error": str(e),
}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Evaluation metrics # Evaluation metrics
@@ -398,7 +463,13 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
failed = [r[model_name] for r in results if not r[model_name]["success"]] failed = [r[model_name] for r in results if not r[model_name]["success"]]
if not model_results: if not model_results:
summary[model_name] = {"success_rate": 0, "error": "All runs failed"} summary[model_name] = {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
}
continue continue
latencies = [r["avg_latency_ms"] for r in model_results] latencies = [r["avg_latency_ms"] for r in model_results]
@@ -410,6 +481,7 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
"success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4), "success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
"total_runs": len(model_results), "total_runs": len(model_results),
"total_failures": len(failed), "total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
"latency": { "latency": {
"mean_ms": round(statistics.mean(latencies), 1), "mean_ms": round(statistics.mean(latencies), 1),
"median_ms": round(statistics.median(latencies), 1), "median_ms": round(statistics.median(latencies), 1),
@@ -495,6 +567,23 @@ def to_markdown(report: dict) -> str:
f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |" f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
) )
lines += ["", "## Failure Modes", ""]
had_failures = False
for mkey, mname in config["models"].items():
model_summary = summary.get(mkey, {})
failure_examples = model_summary.get("failure_examples", [])
if not failure_examples and not model_summary.get("error"):
continue
had_failures = True
lines.append(f"### {mname}")
if model_summary.get("error"):
lines.append(f"- Summary: {model_summary['error']}")
for err in failure_examples:
lines.append(f"- {err}")
lines.append("")
if not had_failures:
lines.append("- No provider/runtime failures recorded.")
# Verdict # Verdict
lines += ["", "## Verdict", ""] lines += ["", "## Verdict", ""]
@@ -516,8 +605,12 @@ def to_markdown(report: dict) -> str:
if best_model: if best_model:
lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})") lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
lines.append("")
lines.append("Recommendation: keep the best-performing Gemma/Gemini lane from this run and only switch if repeated runs disagree.")
else: else:
lines.append("No clear winner — insufficient data.") lines.append("Benchmark blocked or insufficient data for a trustworthy winner.")
lines.append("")
lines.append("Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.")
return "\n".join(lines) return "\n".join(lines)
@@ -528,44 +621,124 @@ def to_markdown(report: dict) -> str:
def generate_sample_dataset() -> List[dict]: def generate_sample_dataset() -> List[dict]:
"""Generate a sample test dataset with diverse public images. """Generate a larger benchmark dataset aligned with issue #817.
Returns list of test image definitions. Returns 50+ images across screenshots, diagrams, photos, OCR, charts,
and document-like images so the harness matches the issue contract.
""" """
return [ dataset: List[dict] = []
# Screenshots
{ screenshots = [
"id": "screenshot_github", ("github_mark", "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", ["github", "logo", "mark"]),
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", ("github_social", "https://github.githubassets.com/images/modules/site/social-cards.png", ["github", "page", "web"]),
("github_code_search", "https://github.githubassets.com/images/modules/site/features-code-search.png", ["search", "code", "feature"]),
("terminal_capture", "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", ["terminal", "command", "output"]),
("http_404", "https://http.cat/404.jpg", ["404", "error", "cat"]),
("dummy_cli_01", "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01", ["hermes", "cli", "session"]),
("dummy_cli_02", "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard", ["prompt", "cache", "dashboard"]),
("dummy_ui_01", "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode", ["settings", "voice", "mode"]),
("dummy_ui_02", "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview", ["browser", "vision", "preview"]),
("dummy_ui_03", "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector", ["tool", "call", "inspector"]),
]
for ident, url, keywords in screenshots:
dataset.append({
"id": f"screenshot_{ident}",
"url": url,
"category": "screenshot", "category": "screenshot",
"expected_keywords": ["github", "logo", "octocat"], "expected_keywords": keywords,
"expected_structure": {"min_length": 50, "min_sentences": 2}, "ground_truth_ocr": "",
}, "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
# Diagrams })
{
"id": "diagram_architecture", diagrams = [
"url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw", ("flow_a", "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker", ["flowchart", "api", "worker"]),
("flow_b", "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client", ["architecture", "diagram", "cache"]),
("uml_a", "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message", ["class", "diagram", "session"]),
("uml_b", "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response", ["sequence", "diagram", "response"]),
("network_a", "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router", ["network", "node", "router"]),
("network_b", "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth", ["service", "mesh", "auth"]),
("state_machine", "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop", ["state", "machine", "idle"]),
("mind_map", "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools", ["mind", "memory", "tools"]),
("pipeline", "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize", ["pipeline", "ingest", "summarize"]),
("org_chart", "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops", ["org", "chart", "review"]),
]
for ident, url, keywords in diagrams:
dataset.append({
"id": f"diagram_{ident}",
"url": url,
"category": "diagram", "category": "diagram",
"expected_keywords": ["architecture", "component", "service"], "expected_keywords": keywords,
"expected_structure": {"min_length": 100, "min_sentences": 3}, "ground_truth_ocr": "",
}, "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": False},
# Photos })
{
"id": "photo_nature", for idx in range(1, 11):
"url": "https://picsum.photos/seed/bench1/400/300", dataset.append({
"id": f"photo_random_{idx:02d}",
"url": f"https://picsum.photos/seed/vision-bench-{idx}/640/480",
"category": "photo", "category": "photo",
"expected_keywords": [], "expected_keywords": [],
"expected_structure": {"min_length": 30, "min_sentences": 1}, "ground_truth_ocr": "",
}, "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
# Charts })
{
"id": "chart_bar", charts = [
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}", ("bar_quarterly", "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", ["bar", "chart", "revenue"]),
"category": "chart", ("pie_market", "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", ["pie", "chart", "percentage"]),
"expected_keywords": ["bar", "chart", "data"], ("line_temp", "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", ["line", "chart", "temperature"]),
"expected_structure": {"min_length": 50, "min_sentences": 2}, ("radar_skill", "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", ["radar", "chart", "skill"]),
}, ("stacked_cloud", "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", ["stacked", "bar", "chart"]),
("area_growth", "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}", ["line", "growth", "chart"]),
("scatter_eval", "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}", ["scatter", "chart", "runs"]),
("horizontal_bar", "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}", ["bar", "score", "ocr"]),
("bubble_usage", "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}", ["bubble", "latency", "chart"]),
("doughnut_devices", "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", ["doughnut", "chart", "device"]),
] ]
for ident, url, keywords in charts:
dataset.append({
"id": f"chart_{ident}",
"url": url,
"category": "chart",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": True},
})
ocr_texts = [
"Hermes OCR Alpha 01",
"Prompt Cache Hit 87%",
"Session 42 Ready",
"Latency 118 ms",
"Voice Mode Enabled",
]
for idx, text in enumerate(ocr_texts, start=1):
dataset.append({
"id": f"ocr_text_{idx:02d}",
"url": f"https://dummyimage.com/1200x320/ffffff/000000.png&text={text.replace(' ', '+')}",
"category": "ocr",
"expected_keywords": text.lower().split()[:2],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 10, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
documents = [
"Invoice 1001 Total 42 Due 2026-04-22",
"Form A Name Alice Status Approved",
"Report Memory Recall Score 91 Percent",
"Checklist Crisis Escalation Call 988 Now",
"Meeting Notes Vision Benchmark Run Pending",
]
for idx, text in enumerate(documents, start=1):
dataset.append({
"id": f"document_text_{idx:02d}",
"url": f"https://dummyimage.com/1400x900/f8fafc/0f172a.png&text={text.replace(' ', '+')}",
"category": "document",
"expected_keywords": text.lower().split()[:3],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 20, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
return dataset
def load_dataset(path: str) -> List[dict]: def load_dataset(path: str) -> List[dict]:
@@ -585,7 +758,9 @@ async def main():
parser.add_argument("--url", help="Single image URL to test") parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--category", default="photo", help="Category for single URL") parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file") parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--markdown-output", default=None, help="Optional markdown report output path")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image") parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
parser.add_argument("--limit", type=int, default=0, help="Limit to the first N images for smoke runs")
parser.add_argument("--models", nargs="+", default=None, parser.add_argument("--models", nargs="+", default=None,
help="Models to test (default: all)") help="Models to test (default: all)")
parser.add_argument("--markdown", action="store_true", help="Output markdown report") parser.add_argument("--markdown", action="store_true", help="Output markdown report")
@@ -617,9 +792,14 @@ async def main():
print("ERROR: Provide --images or --url") print("ERROR: Provide --images or --url")
sys.exit(1) sys.exit(1)
if args.limit and args.limit > 0:
images = images[:args.limit]
# Run benchmark # Run benchmark
report = await run_benchmark_suite(images, selected, args.runs) report = await run_benchmark_suite(images, selected, args.runs)
markdown_report = to_markdown(report)
# Output # Output
if args.output: if args.output:
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
@@ -627,8 +807,14 @@ async def main():
json.dump(report, f, indent=2) json.dump(report, f, indent=2)
print(f"\nResults saved to {args.output}") print(f"\nResults saved to {args.output}")
if args.markdown_output:
os.makedirs(os.path.dirname(args.markdown_output) or ".", exist_ok=True)
with open(args.markdown_output, "w", encoding="utf-8") as f:
f.write(markdown_report)
print(f"Markdown report saved to {args.markdown_output}")
if args.markdown or not args.output: if args.markdown or not args.output:
print("\n" + to_markdown(report)) print("\n" + markdown_report)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -0,0 +1,67 @@
{
"generated_at": "2026-04-22T16:21:56.271426+00:00",
"config": {
"total_images": 2,
"runs_per_model": 1,
"models": {
"gemma4": "Gemma 4 27B",
"gemini3_flash": "Gemini 3 Flash Preview"
}
},
"results": [
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_mark",
"category": "screenshot"
},
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_social",
"category": "screenshot"
}
],
"summary": {
"gemma4": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500"
]
},
"gemini3_flash": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found"
]
}
}
}

View File

@@ -0,0 +1,44 @@
# Vision Benchmark Report
Generated: 2026-04-22T16:21
Images tested: 2
Runs per model: 1
Models: Gemma 4 27B, Gemini 3 Flash Preview
## Latency Comparison
| Model | Mean (ms) | Median | P95 | Std Dev |
|-------|-----------|--------|-----|---------|
## Accuracy Comparison
| Model | OCR Accuracy | Keyword Coverage | Success Rate |
|-------|-------------|-----------------|--------------|
## Token Usage
| Model | Mean Tokens/Image | Total Tokens |
|-------|------------------|--------------|
## Failure Modes
### Gemma 4 27B
- Summary: All runs failed
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500
### Gemini 3 Flash Preview
- Summary: All runs failed
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found
## Verdict
Benchmark blocked or insufficient data for a trustworthy winner.
Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.