diff --git a/benchmarks/test_images.json b/benchmarks/test_images.json index 7c4c50a8a..f5af8d673 100644 --- a/benchmarks/test_images.json +++ b/benchmarks/test_images.json @@ -1,194 +1,757 @@ [ { - "id": "screenshot_github_home", + "id": "screenshot_github_mark", "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", "category": "screenshot", - "expected_keywords": ["github", "logo", "mark"], + "expected_keywords": [ + "github", + "logo", + "mark" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } }, { - "id": "diagram_mermaid_flow", - "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w", + "id": "screenshot_github_social", + "url": "https://github.githubassets.com/images/modules/site/social-cards.png", + "category": "screenshot", + "expected_keywords": [ + "github", + "page", + "web" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "screenshot_github_code_search", + "url": "https://github.githubassets.com/images/modules/site/features-code-search.png", + "category": "screenshot", + "expected_keywords": [ + "search", + "code", + "feature" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "screenshot_terminal_capture", + "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", + "category": "screenshot", + "expected_keywords": [ + "terminal", + "command", + "output" + ], + "ground_truth_ocr": "", + 
"expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "screenshot_http_404", + "url": "https://http.cat/404.jpg", + "category": "screenshot", + "expected_keywords": [ + "404", + "error", + "cat" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "screenshot_dummy_cli_01", + "url": "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01", + "category": "screenshot", + "expected_keywords": [ + "hermes", + "cli", + "session" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "screenshot_dummy_cli_02", + "url": "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard", + "category": "screenshot", + "expected_keywords": [ + "prompt", + "cache", + "dashboard" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "screenshot_dummy_ui_01", + "url": "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode", + "category": "screenshot", + "expected_keywords": [ + "settings", + "voice", + "mode" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "screenshot_dummy_ui_02", + "url": "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview", + "category": "screenshot", + "expected_keywords": [ + "browser", + "vision", + "preview" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "screenshot_dummy_ui_03", + "url": "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector", + "category": "screenshot", + "expected_keywords": [ + "tool", + "call", + "inspector" + ], + 
"ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "diagram_flow_a", + "url": "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker", "category": "diagram", - "expected_keywords": ["flow", "diagram", "process"], + "expected_keywords": [ + "flowchart", + "api", + "worker" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } }, { - "id": "photo_random_1", - "url": "https://picsum.photos/seed/vision1/400/300", - "category": "photo", - "expected_keywords": [], + "id": "diagram_flow_b", + "url": "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client", + "category": "diagram", + "expected_keywords": [ + "architecture", + "diagram", + "cache" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } }, { - "id": "photo_random_2", - "url": "https://picsum.photos/seed/vision2/400/300", - "category": "photo", - "expected_keywords": [], + "id": "diagram_uml_a", + "url": "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message", + "category": "diagram", + "expected_keywords": [ + "class", + "diagram", + "session" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } }, { - "id": "chart_simple_bar", - "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", - "category": "chart", - "expected_keywords": ["bar", "chart", "revenue"], + "id": 
"diagram_uml_b", + "url": "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response", + "category": "diagram", + "expected_keywords": [ + "sequence", + "diagram", + "response" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } }, { - "id": "chart_pie", - "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", - "category": "chart", - "expected_keywords": ["pie", "chart", "percentage"], + "id": "diagram_network_a", + "url": "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router", + "category": "diagram", + "expected_keywords": [ + "network", + "node", + "router" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } + }, + { + "id": "diagram_network_b", + "url": "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth", + "category": "diagram", + "expected_keywords": [ + "service", + "mesh", + "auth" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } + }, + { + "id": "diagram_state_machine", + "url": "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop", + "category": "diagram", + "expected_keywords": [ + "state", + "machine", + "idle" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } + }, + { + "id": "diagram_mind_map", + "url": "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools", + "category": "diagram", + "expected_keywords": [ + "mind", + "memory", + "tools" + ], + "ground_truth_ocr": "", + "expected_structure": 
{ + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } + }, + { + "id": "diagram_pipeline", + "url": "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize", + "category": "diagram", + "expected_keywords": [ + "pipeline", + "ingest", + "summarize" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } }, { "id": "diagram_org_chart", - "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg", + "url": "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops", "category": "diagram", - "expected_keywords": ["organization", "hierarchy", "chart"], + "expected_keywords": [ + "org", + "chart", + "review" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": false + } }, { - "id": "screenshot_terminal", - "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", - "category": "screenshot", - "expected_keywords": ["terminal", "command", "output"], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} - }, - { - "id": "photo_random_3", - "url": "https://picsum.photos/seed/vision3/400/300", + "id": "photo_random_01", + "url": "https://picsum.photos/seed/vision-bench-1/640/480", "category": "photo", "expected_keywords": [], "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } }, { - "id": "chart_line", + 
"id": "photo_random_02", + "url": "https://picsum.photos/seed/vision-bench-2/640/480", + "category": "photo", + "expected_keywords": [], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "photo_random_03", + "url": "https://picsum.photos/seed/vision-bench-3/640/480", + "category": "photo", + "expected_keywords": [], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "photo_random_04", + "url": "https://picsum.photos/seed/vision-bench-4/640/480", + "category": "photo", + "expected_keywords": [], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "photo_random_05", + "url": "https://picsum.photos/seed/vision-bench-5/640/480", + "category": "photo", + "expected_keywords": [], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "photo_random_06", + "url": "https://picsum.photos/seed/vision-bench-6/640/480", + "category": "photo", + "expected_keywords": [], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "photo_random_07", + "url": "https://picsum.photos/seed/vision-bench-7/640/480", + "category": "photo", + "expected_keywords": [], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "photo_random_08", + "url": "https://picsum.photos/seed/vision-bench-8/640/480", + "category": "photo", + "expected_keywords": [], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "photo_random_09", + "url": "https://picsum.photos/seed/vision-bench-9/640/480", + "category": "photo", + 
"expected_keywords": [], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "photo_random_10", + "url": "https://picsum.photos/seed/vision-bench-10/640/480", + "category": "photo", + "expected_keywords": [], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 30, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "chart_bar_quarterly", + "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", + "category": "chart", + "expected_keywords": [ + "bar", + "chart", + "revenue" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": true + } + }, + { + "id": "chart_pie_market", + "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", + "category": "chart", + "expected_keywords": [ + "pie", + "chart", + "percentage" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": true + } + }, + { + "id": "chart_line_temp", "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", "category": "chart", - "expected_keywords": ["line", "chart", "temperature"], + "expected_keywords": [ + "line", + "chart", + "temperature" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": true + } }, { - "id": "diagram_sequence", - "url": 
"https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg", - "category": "diagram", - "expected_keywords": ["sequence", "interaction", "message"], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false} - }, - { - "id": "photo_random_4", - "url": "https://picsum.photos/seed/vision4/400/300", - "category": "photo", - "expected_keywords": [], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} - }, - { - "id": "screenshot_webpage", - "url": "https://github.githubassets.com/images/modules/site/social-cards.png", - "category": "screenshot", - "expected_keywords": ["github", "page", "web"], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} - }, - { - "id": "chart_radar", + "id": "chart_radar_skill", "url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", "category": "chart", - "expected_keywords": ["radar", "chart", "skill"], + "expected_keywords": [ + "radar", + "chart", + "skill" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": true + } }, { - "id": "photo_random_5", - "url": "https://picsum.photos/seed/vision5/400/300", - "category": "photo", - "expected_keywords": [], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} - }, - { - "id": "diagram_class", - "url": 
"https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg", - "category": "diagram", - "expected_keywords": ["class", "object", "attribute"], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false} - }, - { - "id": "chart_doughnut", - "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", - "category": "chart", - "expected_keywords": ["doughnut", "chart", "device"], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} - }, - { - "id": "photo_random_6", - "url": "https://picsum.photos/seed/vision6/400/300", - "category": "photo", - "expected_keywords": [], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} - }, - { - "id": "screenshot_error", - "url": "https://http.cat/404.jpg", - "category": "screenshot", - "expected_keywords": ["404", "error", "cat"], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true} - }, - { - "id": "diagram_network", - "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg", - "category": "diagram", - "expected_keywords": ["network", "node", "connection"], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false} - }, - { - "id": "photo_random_7", - "url": "https://picsum.photos/seed/vision7/400/300", - "category": "photo", - 
"expected_keywords": [], - "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} - }, - { - "id": "chart_stacked_bar", + "id": "chart_stacked_cloud", "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", "category": "chart", - "expected_keywords": ["stacked", "bar", "chart"], + "expected_keywords": [ + "stacked", + "bar", + "chart" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": true + } }, { - "id": "screenshot_dashboard", - "url": "https://github.githubassets.com/images/modules/site/features-code-search.png", - "category": "screenshot", - "expected_keywords": ["search", "code", "feature"], + "id": "chart_area_growth", + "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}", + "category": "chart", + "expected_keywords": [ + "line", + "growth", + "chart" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": true + } }, { - "id": "photo_random_8", - "url": "https://picsum.photos/seed/vision8/400/300", - "category": "photo", - "expected_keywords": [], + "id": "chart_scatter_eval", + "url": "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}", + "category": "chart", + "expected_keywords": [ + "scatter", + "chart", + "runs" + ], "ground_truth_ocr": "", - "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false} + "expected_structure": { + "min_length": 50, + "min_sentences": 
2, + "has_numbers": true + } + }, + { + "id": "chart_horizontal_bar", + "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}", + "category": "chart", + "expected_keywords": [ + "bar", + "score", + "ocr" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": true + } + }, + { + "id": "chart_bubble_usage", + "url": "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}", + "category": "chart", + "expected_keywords": [ + "bubble", + "latency", + "chart" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": true + } + }, + { + "id": "chart_doughnut_devices", + "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", + "category": "chart", + "expected_keywords": [ + "doughnut", + "chart", + "device" + ], + "ground_truth_ocr": "", + "expected_structure": { + "min_length": 50, + "min_sentences": 2, + "has_numbers": true + } + }, + { + "id": "ocr_text_01", + "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Hermes+OCR+Alpha+01", + "category": "ocr", + "expected_keywords": [ + "hermes", + "ocr" + ], + "ground_truth_ocr": "Hermes OCR Alpha 01", + "expected_structure": { + "min_length": 10, + "min_sentences": 1, + "has_numbers": true + } + }, + { + "id": "ocr_text_02", + "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Prompt+Cache+Hit+87%25", + "category": "ocr", + "expected_keywords": [ + "prompt", + "cache" + ], + "ground_truth_ocr": "Prompt Cache Hit 87%", + "expected_structure": { + "min_length": 10, + "min_sentences": 1, + "has_numbers": true + } + }, + { + "id": "ocr_text_03", + "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Session+42+Ready", + "category":
"ocr", + "expected_keywords": [ + "session", + "42" + ], + "ground_truth_ocr": "Session 42 Ready", + "expected_structure": { + "min_length": 10, + "min_sentences": 1, + "has_numbers": true + } + }, + { + "id": "ocr_text_04", + "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Latency+118+ms", + "category": "ocr", + "expected_keywords": [ + "latency", + "118" + ], + "ground_truth_ocr": "Latency 118 ms", + "expected_structure": { + "min_length": 10, + "min_sentences": 1, + "has_numbers": true + } + }, + { + "id": "ocr_text_05", + "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Voice+Mode+Enabled", + "category": "ocr", + "expected_keywords": [ + "voice", + "mode" + ], + "ground_truth_ocr": "Voice Mode Enabled", + "expected_structure": { + "min_length": 10, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "document_text_01", + "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Invoice+1001+Total+42+Due+2026-04-22", + "category": "document", + "expected_keywords": [ + "invoice", + "1001", + "total" + ], + "ground_truth_ocr": "Invoice 1001 Total 42 Due 2026-04-22", + "expected_structure": { + "min_length": 20, + "min_sentences": 1, + "has_numbers": true + } + }, + { + "id": "document_text_02", + "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Form+A+Name+Alice+Status+Approved", + "category": "document", + "expected_keywords": [ + "form", + "a", + "name" + ], + "ground_truth_ocr": "Form A Name Alice Status Approved", + "expected_structure": { + "min_length": 20, + "min_sentences": 1, + "has_numbers": false + } + }, + { + "id": "document_text_03", + "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Report+Memory+Recall+Score+91+Percent", + "category": "document", + "expected_keywords": [ + "report", + "memory", + "recall" + ], + "ground_truth_ocr": "Report Memory Recall Score 91 Percent", + "expected_structure": { + "min_length": 20, + "min_sentences": 1, + "has_numbers": true + } + }, + { 
+ "id": "document_text_04", + "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Checklist+Crisis+Escalation+Call+988+Now", + "category": "document", + "expected_keywords": [ + "checklist", + "crisis", + "escalation" + ], + "ground_truth_ocr": "Checklist Crisis Escalation Call 988 Now", + "expected_structure": { + "min_length": 20, + "min_sentences": 1, + "has_numbers": true + } + }, + { + "id": "document_text_05", + "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Meeting+Notes+Vision+Benchmark+Run+Pending", + "category": "document", + "expected_keywords": [ + "meeting", + "notes", + "vision" + ], + "ground_truth_ocr": "Meeting Notes Vision Benchmark Run Pending", + "expected_structure": { + "min_length": 20, + "min_sentences": 1, + "has_numbers": false + } } -] +] \ No newline at end of file diff --git a/benchmarks/vision_benchmark.py b/benchmarks/vision_benchmark.py index e1d272f12..13d3ec757 100644 --- a/benchmarks/vision_benchmark.py +++ b/benchmarks/vision_benchmark.py @@ -22,10 +22,12 @@ import argparse import asyncio import base64 import json +import mimetypes import os import statistics import sys import time +import urllib.request from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional @@ -41,12 +43,16 @@ MODELS = { "model_id": "google/gemma-4-27b-it", "display_name": "Gemma 4 27B", "provider": "nous", + "fallback_provider": "ollama", + "fallback_model_id": "gemma4:latest", "description": "Google's multimodal Gemma 4 model", }, "gemini3_flash": { "model_id": "google/gemini-3-flash-preview", "display_name": "Gemini 3 Flash Preview", "provider": "openrouter", + "fallback_provider": "gemini", + "fallback_model_id": "gemini-2.5-flash", "description": "Current default vision model", }, } @@ -84,91 +90,150 @@ async def analyze_with_model( """ import httpx + def _load_image_bytes_cached() -> tuple[bytes, str]: + nonlocal _image_bytes, _mime_type + if _image_bytes is not None: + 
return _image_bytes, _mime_type + if image_url.startswith(("http://", "https://")): + with urllib.request.urlopen(image_url, timeout=30) as resp: + _image_bytes = resp.read() + _mime_type = resp.headers.get_content_type() or mimetypes.guess_type(image_url)[0] or "image/png" + else: + path = Path(image_url).expanduser() + _image_bytes = path.read_bytes() + _mime_type = mimetypes.guess_type(str(path))[0] or "image/png" + return _image_bytes, _mime_type + + def _data_url() -> str: + image_bytes, mime_type = _load_image_bytes_cached() + return f"data:{mime_type};base64,{base64.b64encode(image_bytes).decode()}" + + def _provider_key(provider: str) -> str: + if provider == "openrouter": + return os.getenv("OPENROUTER_API_KEY", "") + if provider == "nous": + return os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "") + if provider == "gemini": + return os.getenv("GEMINI_API_KEY", "") or os.getenv("GOOGLE_API_KEY", "") + return os.getenv(f"{provider.upper()}_API_KEY", "") + provider = model_config["provider"] model_id = model_config["model_id"] + candidates = [(provider, model_id)] + if model_config.get("fallback_provider") and model_config.get("fallback_model_id"): + candidates.append((model_config["fallback_provider"], model_config["fallback_model_id"])) - # Prepare messages - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": prompt}, - {"type": "image_url", "image_url": {"url": image_url}}, - ], - } - ] + _image_bytes: Optional[bytes] = None + _mime_type = "image/png" + failures = [] - # Route to provider - if provider == "openrouter": - api_url = "https://openrouter.ai/api/v1/chat/completions" - api_key = os.getenv("OPENROUTER_API_KEY", "") - elif provider == "nous": - api_url = "https://inference.nousresearch.com/v1/chat/completions" - api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "") - else: - api_url = os.getenv(f"{provider.upper()}_API_URL", "") - api_key = 
os.getenv(f"{provider.upper()}_API_KEY", "") + for candidate_provider, candidate_model in candidates: + api_key = _provider_key(candidate_provider) + start = time.perf_counter() + try: + if candidate_provider in {"openrouter", "nous"}: + api_url = ( + "https://openrouter.ai/api/v1/chat/completions" + if candidate_provider == "openrouter" + else "https://inference.nousresearch.com/v1/chat/completions" + ) + if not api_key: + raise RuntimeError(f"No API key for provider {candidate_provider}") + payload = { + "model": candidate_model, + "messages": [{ + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": _data_url() if not image_url.startswith(("http://", "https://")) else image_url}}, + ], + }], + "max_tokens": 2000, + "temperature": 0.1, + } + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + async with httpx.AsyncClient(timeout=timeout) as client: + resp = await client.post(api_url, json=payload, headers=headers) + resp.raise_for_status() + data = resp.json() + analysis = data.get("choices", [{}])[0].get("message", {}).get("content", "") + usage = data.get("usage", {}) + tokens = { + "prompt_tokens": usage.get("prompt_tokens", 0), + "completion_tokens": usage.get("completion_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + } + elif candidate_provider == "gemini": + if not api_key: + raise RuntimeError("No API key for provider gemini") + image_bytes, mime_type = _load_image_bytes_cached() + api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{candidate_model}:generateContent?key={api_key}" + payload = { + "contents": [{"parts": [ + {"text": prompt}, + {"inline_data": {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}}, + ]}], + "generationConfig": {"temperature": 0.1, "maxOutputTokens": 2000}, + } + async with httpx.AsyncClient(timeout=timeout) as client: + resp = await client.post(api_url, json=payload) + 
resp.raise_for_status() + data = resp.json() + parts = data.get("candidates", [{}])[0].get("content", {}).get("parts", []) + analysis = "\n".join(part.get("text", "") for part in parts if isinstance(part, dict) and part.get("text")) + usage = data.get("usageMetadata", {}) + tokens = { + "prompt_tokens": usage.get("promptTokenCount", 0), + "completion_tokens": usage.get("candidatesTokenCount", 0), + "total_tokens": usage.get("totalTokenCount", 0), + } + elif candidate_provider == "ollama": + image_bytes, _ = _load_image_bytes_cached() + payload = { + "model": candidate_model, + "stream": False, + "messages": [{"role": "user", "content": prompt, "images": [base64.b64encode(image_bytes).decode()]}], + "options": {"temperature": 0.1}, + } + async with httpx.AsyncClient(timeout=timeout) as client: + resp = await client.post("http://localhost:11434/api/chat", json=payload) + resp.raise_for_status() + data = resp.json() + analysis = data.get("message", {}).get("content", "") + tokens = { + "prompt_tokens": data.get("prompt_eval_count", 0), + "completion_tokens": data.get("eval_count", 0), + "total_tokens": (data.get("prompt_eval_count", 0) or 0) + (data.get("eval_count", 0) or 0), + } + else: + raise RuntimeError(f"Unsupported provider {candidate_provider}") - if not api_key: - return { - "analysis": "", - "latency_ms": 0, - "tokens": {}, - "success": False, - "error": f"No API key for provider {provider}", - } + latency_ms = (time.perf_counter() - start) * 1000 + return { + "analysis": analysis, + "latency_ms": round(latency_ms, 1), + "tokens": tokens, + "success": True, + "error": "", + "provider_used": candidate_provider, + "model_used": candidate_model, + } + except Exception as e: + failures.append(f"{candidate_provider}:{candidate_model} => {e}") - headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json", + return { + "analysis": "", + "latency_ms": 0, + "tokens": {}, + "success": False, + "error": " | ".join(failures) if failures 
else "No runs", + "provider_used": candidates[-1][0] if candidates else provider, + "model_used": candidates[-1][1] if candidates else model_id, } - payload = { - "model": model_id, - "messages": messages, - "max_tokens": 2000, - "temperature": 0.1, - } - - start = time.perf_counter() - try: - async with httpx.AsyncClient(timeout=timeout) as client: - resp = await client.post(api_url, json=payload, headers=headers) - resp.raise_for_status() - data = resp.json() - - latency_ms = (time.perf_counter() - start) * 1000 - - analysis = "" - choices = data.get("choices", []) - if choices: - msg = choices[0].get("message", {}) - analysis = msg.get("content", "") - - usage = data.get("usage", {}) - tokens = { - "prompt_tokens": usage.get("prompt_tokens", 0), - "completion_tokens": usage.get("completion_tokens", 0), - "total_tokens": usage.get("total_tokens", 0), - } - - return { - "analysis": analysis, - "latency_ms": round(latency_ms, 1), - "tokens": tokens, - "success": True, - "error": "", - } - - except Exception as e: - return { - "analysis": "", - "latency_ms": round((time.perf_counter() - start) * 1000, 1), - "tokens": {}, - "success": False, - "error": str(e), - } - # --------------------------------------------------------------------------- # Evaluation metrics @@ -398,7 +463,13 @@ def aggregate_results(results: List[dict], models: dict) -> dict: failed = [r[model_name] for r in results if not r[model_name]["success"]] if not model_results: - summary[model_name] = {"success_rate": 0, "error": "All runs failed"} + summary[model_name] = { + "success_rate": 0, + "error": "All runs failed", + "total_runs": 0, + "total_failures": len(failed), + "failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3], + } continue latencies = [r["avg_latency_ms"] for r in model_results] @@ -410,6 +481,7 @@ def aggregate_results(results: List[dict], models: dict) -> dict: "success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4), 
"total_runs": len(model_results), "total_failures": len(failed), + "failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3], "latency": { "mean_ms": round(statistics.mean(latencies), 1), "median_ms": round(statistics.median(latencies), 1), @@ -495,6 +567,23 @@ def to_markdown(report: dict) -> str: f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |" ) + lines += ["", "## Failure Modes", ""] + had_failures = False + for mkey, mname in config["models"].items(): + model_summary = summary.get(mkey, {}) + failure_examples = model_summary.get("failure_examples", []) + if not failure_examples and not model_summary.get("error"): + continue + had_failures = True + lines.append(f"### {mname}") + if model_summary.get("error"): + lines.append(f"- Summary: {model_summary['error']}") + for err in failure_examples: + lines.append(f"- {err}") + lines.append("") + if not had_failures: + lines.append("- No provider/runtime failures recorded.") + # Verdict lines += ["", "## Verdict", ""] @@ -516,8 +605,12 @@ def to_markdown(report: dict) -> str: if best_model: lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})") + lines.append("") + lines.append("Recommendation: keep the best-performing Gemma/Gemini lane from this run and only switch if repeated runs disagree.") else: - lines.append("No clear winner — insufficient data.") + lines.append("Benchmark blocked or insufficient data for a trustworthy winner.") + lines.append("") + lines.append("Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.") return "\n".join(lines) @@ -528,44 +621,124 @@ def to_markdown(report: dict) -> str: def generate_sample_dataset() -> List[dict]: - """Generate a sample test dataset with diverse public images. + """Generate a larger benchmark dataset aligned with issue #817. - Returns list of test image definitions. 
+ Returns 50+ images across screenshots, diagrams, photos, OCR, charts, + and document-like images so the harness matches the issue contract. """ - return [ - # Screenshots - { - "id": "screenshot_github", - "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", + dataset: List[dict] = [] + + screenshots = [ + ("github_mark", "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", ["github", "logo", "mark"]), + ("github_social", "https://github.githubassets.com/images/modules/site/social-cards.png", ["github", "page", "web"]), + ("github_code_search", "https://github.githubassets.com/images/modules/site/features-code-search.png", ["search", "code", "feature"]), + ("terminal_capture", "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", ["terminal", "command", "output"]), + ("http_404", "https://http.cat/404.jpg", ["404", "error", "cat"]), + ("dummy_cli_01", "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01", ["hermes", "cli", "session"]), + ("dummy_cli_02", "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard", ["prompt", "cache", "dashboard"]), + ("dummy_ui_01", "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode", ["settings", "voice", "mode"]), + ("dummy_ui_02", "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview", ["browser", "vision", "preview"]), + ("dummy_ui_03", "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector", ["tool", "call", "inspector"]), + ] + for ident, url, keywords in screenshots: + dataset.append({ + "id": f"screenshot_{ident}", + "url": url, "category": "screenshot", - "expected_keywords": ["github", "logo", "octocat"], - "expected_structure": {"min_length": 50, "min_sentences": 2}, - }, - # Diagrams - { - "id": "diagram_architecture", - "url": 
"https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw", + "expected_keywords": keywords, + "ground_truth_ocr": "", + "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False}, + }) + + diagrams = [ + ("flow_a", "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker", ["flowchart", "api", "worker"]), + ("flow_b", "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client", ["architecture", "diagram", "cache"]), + ("uml_a", "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message", ["class", "diagram", "session"]), + ("uml_b", "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response", ["sequence", "diagram", "response"]), + ("network_a", "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router", ["network", "node", "router"]), + ("network_b", "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth", ["service", "mesh", "auth"]), + ("state_machine", "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop", ["state", "machine", "idle"]), + ("mind_map", "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools", ["mind", "memory", "tools"]), + ("pipeline", "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize", ["pipeline", "ingest", "summarize"]), + ("org_chart", "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops", ["org", "chart", "review"]), + ] + for ident, url, keywords in diagrams: + dataset.append({ + "id": f"diagram_{ident}", + "url": url, "category": "diagram", - "expected_keywords": 
["architecture", "component", "service"], - "expected_structure": {"min_length": 100, "min_sentences": 3}, - }, - # Photos - { - "id": "photo_nature", - "url": "https://picsum.photos/seed/bench1/400/300", + "expected_keywords": keywords, + "ground_truth_ocr": "", + "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": False}, + }) + + for idx in range(1, 11): + dataset.append({ + "id": f"photo_random_{idx:02d}", + "url": f"https://picsum.photos/seed/vision-bench-{idx}/640/480", "category": "photo", "expected_keywords": [], - "expected_structure": {"min_length": 30, "min_sentences": 1}, - }, - # Charts - { - "id": "chart_bar", - "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}", - "category": "chart", - "expected_keywords": ["bar", "chart", "data"], - "expected_structure": {"min_length": 50, "min_sentences": 2}, - }, + "ground_truth_ocr": "", + "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False}, + }) + + charts = [ + ("bar_quarterly", "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", ["bar", "chart", "revenue"]), + ("pie_market", "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", ["pie", "chart", "percentage"]), + ("line_temp", "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", ["line", "chart", "temperature"]), + ("radar_skill", "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", ["radar", "chart", "skill"]), + ("stacked_cloud", 
"https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", ["stacked", "bar", "chart"]), + ("area_growth", "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}", ["line", "growth", "chart"]), + ("scatter_eval", "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}", ["scatter", "chart", "runs"]), + ("horizontal_bar", "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}", ["bar", "score", "ocr"]), + ("bubble_usage", "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}", ["bubble", "latency", "chart"]), + ("doughnut_devices", "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", ["doughnut", "chart", "device"]), ] + for ident, url, keywords in charts: + dataset.append({ + "id": f"chart_{ident}", + "url": url, + "category": "chart", + "expected_keywords": keywords, + "ground_truth_ocr": "", + "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": True}, + }) + + ocr_texts = [ + "Hermes OCR Alpha 01", + "Prompt Cache Hit 87%", + "Session 42 Ready", + "Latency 118 ms", + "Voice Mode Enabled", + ] + for idx, text in enumerate(ocr_texts, start=1): + dataset.append({ + "id": f"ocr_text_{idx:02d}", + "url": f"https://dummyimage.com/1200x320/ffffff/000000.png&text={text.replace(' ', '+')}", + "category": "ocr", + "expected_keywords": text.lower().split()[:2], + "ground_truth_ocr": text, + "expected_structure": {"min_length": 10, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)}, + }) + + 
documents = [ + "Invoice 1001 Total 42 Due 2026-04-22", + "Form A Name Alice Status Approved", + "Report Memory Recall Score 91 Percent", + "Checklist Crisis Escalation Call 988 Now", + "Meeting Notes Vision Benchmark Run Pending", + ] + for idx, text in enumerate(documents, start=1): + dataset.append({ + "id": f"document_text_{idx:02d}", + "url": f"https://dummyimage.com/1400x900/f8fafc/0f172a.png&text={text.replace(' ', '+')}", + "category": "document", + "expected_keywords": text.lower().split()[:3], + "ground_truth_ocr": text, + "expected_structure": {"min_length": 20, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)}, + }) + + return dataset def load_dataset(path: str) -> List[dict]: @@ -585,7 +758,9 @@ async def main(): parser.add_argument("--url", help="Single image URL to test") parser.add_argument("--category", default="photo", help="Category for single URL") parser.add_argument("--output", default=None, help="Output JSON file") + parser.add_argument("--markdown-output", default=None, help="Optional markdown report output path") parser.add_argument("--runs", type=int, default=1, help="Runs per model per image") + parser.add_argument("--limit", type=int, default=0, help="Limit to the first N images for smoke runs") parser.add_argument("--models", nargs="+", default=None, help="Models to test (default: all)") parser.add_argument("--markdown", action="store_true", help="Output markdown report") @@ -617,9 +792,14 @@ async def main(): print("ERROR: Provide --images or --url") sys.exit(1) + if args.limit and args.limit > 0: + images = images[:args.limit] + # Run benchmark report = await run_benchmark_suite(images, selected, args.runs) + markdown_report = to_markdown(report) + # Output if args.output: os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) @@ -627,8 +807,14 @@ async def main(): json.dump(report, f, indent=2) print(f"\nResults saved to {args.output}") + if args.markdown_output: + 
os.makedirs(os.path.dirname(args.markdown_output) or ".", exist_ok=True) + with open(args.markdown_output, "w", encoding="utf-8") as f: + f.write(markdown_report) + print(f"Markdown report saved to {args.markdown_output}") + if args.markdown or not args.output: - print("\n" + to_markdown(report)) + print("\n" + markdown_report) if __name__ == "__main__": diff --git a/metrics/vision-benchmark-smoke-2026-04-22.json b/metrics/vision-benchmark-smoke-2026-04-22.json new file mode 100644 index 000000000..0838c1944 --- /dev/null +++ b/metrics/vision-benchmark-smoke-2026-04-22.json @@ -0,0 +1,67 @@ +{ + "generated_at": "2026-04-22T16:21:56.271426+00:00", + "config": { + "total_images": 2, + "runs_per_model": 1, + "models": { + "gemma4": "Gemma 4 27B", + "gemini3_flash": "Gemini 3 Flash Preview" + } + }, + "results": [ + { + "gemma4": { + "success": false, + "error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500", + "runs": 0, + "errors": 1 + }, + "gemini3_flash": { + "success": false, + "error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429", + "runs": 0, + "errors": 1 + }, + "image_id": "screenshot_github_mark", + "category": "screenshot" + }, + { + "gemma4": { + "success": false, + "error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404:
Not Found", + "runs": 0, + "errors": 1 + }, + "gemini3_flash": { + "success": false, + "error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found", + "runs": 0, + "errors": 1 + }, + "image_id": "screenshot_github_social", + "category": "screenshot" + } + ], + "summary": { + "gemma4": { + "success_rate": 0, + "error": "All runs failed", + "total_runs": 0, + "total_failures": 2, + "failure_examples": [ + "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found", + "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500" + ] + }, + "gemini3_flash": { + "success_rate": 0, + "error": "All runs failed", + "total_runs": 0, + "total_failures": 2, + "failure_examples": [ + "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429", + "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found" + ] + } + } +} \ No
newline at end of file diff --git a/metrics/vision-benchmark-smoke-2026-04-22.md b/metrics/vision-benchmark-smoke-2026-04-22.md new file mode 100644 index 000000000..eee8efb40 --- /dev/null +++ b/metrics/vision-benchmark-smoke-2026-04-22.md @@ -0,0 +1,44 @@ +# Vision Benchmark Report + +Generated: 2026-04-22T16:21 +Images tested: 2 +Runs per model: 1 +Models: Gemma 4 27B, Gemini 3 Flash Preview + +## Latency Comparison + +| Model | Mean (ms) | Median | P95 | Std Dev | +|-------|-----------|--------|-----|---------| + +## Accuracy Comparison + +| Model | OCR Accuracy | Keyword Coverage | Success Rate | +|-------|-------------|-----------------|--------------| + +## Token Usage + +| Model | Mean Tokens/Image | Total Tokens | +|-------|------------------|--------------| + +## Failure Modes + +### Gemma 4 27B +- Summary: All runs failed +- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found +- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat' +For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500 + +### Gemini 3 Flash Preview +- Summary: All runs failed +- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions' +For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED' +For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429 +- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions' +For more information check:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found + + +## Verdict + +Benchmark blocked or insufficient data for a trustworthy winner. + +Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist. \ No newline at end of file