feat: harden vision benchmark artifacts

Refs #817
wip: tighten vision benchmark acceptance tests
2026-04-22 12:22:28 -04:00 · 2026-04-22 12:10:23 -04:00 · 2026-04-22 12:07:52 -04:00
12 changed files with 1147 additions and 689 deletions
--- a/benchmarks/test_images.json
+++ b/benchmarks/test_images.json
@@ -1,194 +1,757 @@
 [
  {
-    "id": "screenshot_github_home",
+    "id": "screenshot_github_mark",
    "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
    "category": "screenshot",
-    "expected_keywords": ["github", "logo", "mark"],
+    "expected_keywords": [
+      "github",
+      "logo",
+      "mark"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
-    "id": "diagram_mermaid_flow",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
+    "id": "screenshot_github_social",
+    "url": "https://github.githubassets.com/images/modules/site/social-cards.png",
+    "category": "screenshot",
+    "expected_keywords": [
+      "github",
+      "page",
+      "web"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_github_code_search",
+    "url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
+    "category": "screenshot",
+    "expected_keywords": [
+      "search",
+      "code",
+      "feature"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_terminal_capture",
+    "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
+    "category": "screenshot",
+    "expected_keywords": [
+      "terminal",
+      "command",
+      "output"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_http_404",
+    "url": "https://http.cat/404.jpg",
+    "category": "screenshot",
+    "expected_keywords": [
+      "404",
+      "error",
+      "cat"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "screenshot_dummy_cli_01",
+    "url": "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01",
+    "category": "screenshot",
+    "expected_keywords": [
+      "hermes",
+      "cli",
+      "session"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_dummy_cli_02",
+    "url": "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard",
+    "category": "screenshot",
+    "expected_keywords": [
+      "prompt",
+      "cache",
+      "dashboard"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_dummy_ui_01",
+    "url": "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode",
+    "category": "screenshot",
+    "expected_keywords": [
+      "settings",
+      "voice",
+      "mode"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_dummy_ui_02",
+    "url": "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview",
+    "category": "screenshot",
+    "expected_keywords": [
+      "browser",
+      "vision",
+      "preview"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_dummy_ui_03",
+    "url": "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector",
+    "category": "screenshot",
+    "expected_keywords": [
+      "tool",
+      "call",
+      "inspector"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_flow_a",
+    "url": "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker",
    "category": "diagram",
-    "expected_keywords": ["flow", "diagram", "process"],
+    "expected_keywords": [
+      "flowchart",
+      "api",
+      "worker"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "photo_random_1",
-    "url": "https://picsum.photos/seed/vision1/400/300",
-    "category": "photo",
-    "expected_keywords": [],
+    "id": "diagram_flow_b",
+    "url": "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client",
+    "category": "diagram",
+    "expected_keywords": [
+      "architecture",
+      "diagram",
+      "cache"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "photo_random_2",
-    "url": "https://picsum.photos/seed/vision2/400/300",
-    "category": "photo",
-    "expected_keywords": [],
+    "id": "diagram_uml_a",
+    "url": "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message",
+    "category": "diagram",
+    "expected_keywords": [
+      "class",
+      "diagram",
+      "session"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "chart_simple_bar",
-    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
-    "category": "chart",
-    "expected_keywords": ["bar", "chart", "revenue"],
+    "id": "diagram_uml_b",
+    "url": "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response",
+    "category": "diagram",
+    "expected_keywords": [
+      "sequence",
+      "diagram",
+      "response"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "chart_pie",
-    "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
-    "category": "chart",
-    "expected_keywords": ["pie", "chart", "percentage"],
+    "id": "diagram_network_a",
+    "url": "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router",
+    "category": "diagram",
+    "expected_keywords": [
+      "network",
+      "node",
+      "router"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_network_b",
+    "url": "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth",
+    "category": "diagram",
+    "expected_keywords": [
+      "service",
+      "mesh",
+      "auth"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_state_machine",
+    "url": "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop",
+    "category": "diagram",
+    "expected_keywords": [
+      "state",
+      "machine",
+      "idle"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_mind_map",
+    "url": "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools",
+    "category": "diagram",
+    "expected_keywords": [
+      "mind",
+      "memory",
+      "tools"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_pipeline",
+    "url": "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize",
+    "category": "diagram",
+    "expected_keywords": [
+      "pipeline",
+      "ingest",
+      "summarize"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "diagram_org_chart",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops",
    "category": "diagram",
-    "expected_keywords": ["organization", "hierarchy", "chart"],
+    "expected_keywords": [
+      "org",
+      "chart",
+      "review"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "screenshot_terminal",
-    "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
-    "category": "screenshot",
-    "expected_keywords": ["terminal", "command", "output"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "photo_random_3",
-    "url": "https://picsum.photos/seed/vision3/400/300",
+    "id": "photo_random_01",
+    "url": "https://picsum.photos/seed/vision-bench-1/640/480",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
-    "id": "chart_line",
+    "id": "photo_random_02",
+    "url": "https://picsum.photos/seed/vision-bench-2/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_03",
+    "url": "https://picsum.photos/seed/vision-bench-3/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_04",
+    "url": "https://picsum.photos/seed/vision-bench-4/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_05",
+    "url": "https://picsum.photos/seed/vision-bench-5/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_06",
+    "url": "https://picsum.photos/seed/vision-bench-6/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_07",
+    "url": "https://picsum.photos/seed/vision-bench-7/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_08",
+    "url": "https://picsum.photos/seed/vision-bench-8/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_09",
+    "url": "https://picsum.photos/seed/vision-bench-9/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_10",
+    "url": "https://picsum.photos/seed/vision-bench-10/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "chart_bar_quarterly",
+    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "bar",
+      "chart",
+      "revenue"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_pie_market",
+    "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "pie",
+      "chart",
+      "percentage"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_line_temp",
    "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
    "category": "chart",
-    "expected_keywords": ["line", "chart", "temperature"],
+    "expected_keywords": [
+      "line",
+      "chart",
+      "temperature"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
-    "id": "diagram_sequence",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
-    "category": "diagram",
-    "expected_keywords": ["sequence", "interaction", "message"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
-  },
-  {
-    "id": "photo_random_4",
-    "url": "https://picsum.photos/seed/vision4/400/300",
-    "category": "photo",
-    "expected_keywords": [],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "screenshot_webpage",
-    "url": "https://github.githubassets.com/images/modules/site/social-cards.png",
-    "category": "screenshot",
-    "expected_keywords": ["github", "page", "web"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "chart_radar",
+    "id": "chart_radar_skill",
    "url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
    "category": "chart",
-    "expected_keywords": ["radar", "chart", "skill"],
+    "expected_keywords": [
+      "radar",
+      "chart",
+      "skill"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
-    "id": "photo_random_5",
-    "url": "https://picsum.photos/seed/vision5/400/300",
-    "category": "photo",
-    "expected_keywords": [],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "diagram_class",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
-    "category": "diagram",
-    "expected_keywords": ["class", "object", "attribute"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
-  },
-  {
-    "id": "chart_doughnut",
-    "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
-    "category": "chart",
-    "expected_keywords": ["doughnut", "chart", "device"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
-  },
-  {
-    "id": "photo_random_6",
-    "url": "https://picsum.photos/seed/vision6/400/300",
-    "category": "photo",
-    "expected_keywords": [],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "screenshot_error",
-    "url": "https://http.cat/404.jpg",
-    "category": "screenshot",
-    "expected_keywords": ["404", "error", "cat"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
-  },
-  {
-    "id": "diagram_network",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
-    "category": "diagram",
-    "expected_keywords": ["network", "node", "connection"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
-  },
-  {
-    "id": "photo_random_7",
-    "url": "https://picsum.photos/seed/vision7/400/300",
-    "category": "photo",
-    "expected_keywords": [],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "chart_stacked_bar",
+    "id": "chart_stacked_cloud",
    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
    "category": "chart",
-    "expected_keywords": ["stacked", "bar", "chart"],
+    "expected_keywords": [
+      "stacked",
+      "bar",
+      "chart"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
-    "id": "screenshot_dashboard",
-    "url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
-    "category": "screenshot",
-    "expected_keywords": ["search", "code", "feature"],
+    "id": "chart_area_growth",
+    "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "line",
+      "growth",
+      "chart"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
-    "id": "photo_random_8",
-    "url": "https://picsum.photos/seed/vision8/400/300",
-    "category": "photo",
-    "expected_keywords": [],
+    "id": "chart_scatter_eval",
+    "url": "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "scatter",
+      "chart",
+      "runs"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_horizontal_bar",
+    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}",
+    "category": "chart",
+    "expected_keywords": [
+      "bar",
+      "score",
+      "ocr"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_bubble_usage",
+    "url": "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "bubble",
+      "latency",
+      "chart"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_doughnut_devices",
+    "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "doughnut",
+      "chart",
+      "device"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_01",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Hermes+OCR+Alpha+01",
+    "category": "ocr",
+    "expected_keywords": [
+      "hermes",
+      "ocr"
+    ],
+    "ground_truth_ocr": "Hermes OCR Alpha 01",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_02",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Prompt+Cache+Hit+87%25",
+    "category": "ocr",
+    "expected_keywords": [
+      "prompt",
+      "cache"
+    ],
+    "ground_truth_ocr": "Prompt Cache Hit 87%",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_03",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Session+42+Ready",
+    "category": "ocr",
+    "expected_keywords": [
+      "session",
+      "42"
+    ],
+    "ground_truth_ocr": "Session 42 Ready",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_04",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Latency+118+ms",
+    "category": "ocr",
+    "expected_keywords": [
+      "latency",
+      "118"
+    ],
+    "ground_truth_ocr": "Latency 118 ms",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_05",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Voice+Mode+Enabled",
+    "category": "ocr",
+    "expected_keywords": [
+      "voice",
+      "mode"
+    ],
+    "ground_truth_ocr": "Voice Mode Enabled",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "document_text_01",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Invoice+1001+Total+42+Due+2026-04-22",
+    "category": "document",
+    "expected_keywords": [
+      "invoice",
+      "1001",
+      "total"
+    ],
+    "ground_truth_ocr": "Invoice 1001 Total 42 Due 2026-04-22",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "document_text_02",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Form+A+Name+Alice+Status+Approved",
+    "category": "document",
+    "expected_keywords": [
+      "form",
+      "alice",
+      "name"
+    ],
+    "ground_truth_ocr": "Form A Name Alice Status Approved",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "document_text_03",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Report+Memory+Recall+Score+91+Percent",
+    "category": "document",
+    "expected_keywords": [
+      "report",
+      "memory",
+      "recall"
+    ],
+    "ground_truth_ocr": "Report Memory Recall Score 91 Percent",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "document_text_04",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Checklist+Crisis+Escalation+Call+988+Now",
+    "category": "document",
+    "expected_keywords": [
+      "checklist",
+      "crisis",
+      "escalation"
+    ],
+    "ground_truth_ocr": "Checklist Crisis Escalation Call 988 Now",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "document_text_05",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Meeting+Notes+Vision+Benchmark+Run+Pending",
+    "category": "document",
+    "expected_keywords": [
+      "meeting",
+      "notes",
+      "vision"
+    ],
+    "ground_truth_ocr": "Meeting Notes Vision Benchmark Run Pending",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  }
-]
+]
--- a/benchmarks/vision_benchmark.py
+++ b/benchmarks/vision_benchmark.py
@@ -22,10 +22,12 @@ import argparse
 import asyncio
 import base64
 import json
+import mimetypes
 import os
 import statistics
 import sys
 import time
+import urllib.request
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, List, Optional
@@ -41,12 +43,16 @@ MODELS = {
        "model_id": "google/gemma-4-27b-it",
        "display_name": "Gemma 4 27B",
        "provider": "nous",
+        "fallback_provider": "ollama",
+        "fallback_model_id": "gemma4:latest",
        "description": "Google's multimodal Gemma 4 model",
    },
    "gemini3_flash": {
        "model_id": "google/gemini-3-flash-preview",
        "display_name": "Gemini 3 Flash Preview",
        "provider": "openrouter",
+        "fallback_provider": "gemini",
+        "fallback_model_id": "gemini-2.5-flash",
        "description": "Current default vision model",
    },
 }
@@ -84,91 +90,150 @@ async def analyze_with_model(
    """
    import httpx

+    def _load_image_bytes_cached() -> tuple[bytes, str]:
+        """Return ``(image_bytes, mime_type)`` for ``image_url``, loading it at most once.
+
+        HTTP(S) URLs are downloaded with a 30 second timeout; any other value is
+        treated as a local filesystem path (``~`` is expanded). The result is
+        stored in the enclosing function's ``_image_bytes`` / ``_mime_type`` so
+        later calls return the cached pair instead of reloading.
+
+        Raises whatever ``urllib.request.urlopen`` or ``Path.read_bytes`` raise
+        when the image cannot be fetched or read.
+        """
+        nonlocal _image_bytes, _mime_type
+        if _image_bytes is not None:
+            return _image_bytes, _mime_type
+        if image_url.startswith(("http://", "https://")):
+            with urllib.request.urlopen(image_url, timeout=30) as resp:
+                _image_bytes = resp.read()
+                # get_content_type() never returns a falsy value: it reports
+                # "text/plain" when the header is missing or malformed, so an
+                # ``or`` chain after it can never reach the fallbacks.
+                header_type = resp.headers.get_content_type()
+            if header_type.startswith("image/"):
+                _mime_type = header_type
+            else:
+                # Only trust a declared image type; otherwise guess from the
+                # URL and finally default to PNG.
+                _mime_type = mimetypes.guess_type(image_url)[0] or "image/png"
+        else:
+            path = Path(image_url).expanduser()
+            _image_bytes = path.read_bytes()
+            _mime_type = mimetypes.guess_type(str(path))[0] or "image/png"
+        return _image_bytes, _mime_type
+
+    def _data_url() -> str:
+        """Build a base64 ``data:`` URL from the cached image bytes and MIME type."""
+        raw, mime = _load_image_bytes_cached()
+        encoded = base64.b64encode(raw).decode()
+        return f"data:{mime};base64,{encoded}"
+
+    def _provider_key(provider: str) -> str:
+        """Look up the API key for ``provider`` in the environment.
+
+        Known providers have an ordered list of environment variable names and
+        the first non-empty value wins. Any other provider falls back to
+        ``<PROVIDER>_API_KEY``. Returns an empty string when nothing is set.
+        """
+        known_env_names = {
+            "openrouter": ("OPENROUTER_API_KEY",),
+            "nous": ("NOUS_API_KEY", "NOUS_INFERENCE_API_KEY"),
+            "gemini": ("GEMINI_API_KEY", "GOOGLE_API_KEY"),
+        }
+        if provider in known_env_names:
+            env_names = known_env_names[provider]
+        else:
+            env_names = (f"{provider.upper()}_API_KEY",)
+        for env_name in env_names:
+            value = os.getenv(env_name, "")
+            if value:
+                return value
+        return ""
+
    provider = model_config["provider"]
    model_id = model_config["model_id"]
+    candidates = [(provider, model_id)]
+    if model_config.get("fallback_provider") and model_config.get("fallback_model_id"):
+        candidates.append((model_config["fallback_provider"], model_config["fallback_model_id"]))

-    # Prepare messages
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
-            ],
-        }
-    ]
+    _image_bytes: Optional[bytes] = None
+    _mime_type = "image/png"
+    failures = []

-    # Route to provider
-    if provider == "openrouter":
-        api_url = "https://openrouter.ai/api/v1/chat/completions"
-        api_key = os.getenv("OPENROUTER_API_KEY", "")
-    elif provider == "nous":
-        api_url = "https://inference.nousresearch.com/v1/chat/completions"
-        api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
-    else:
-        api_url = os.getenv(f"{provider.upper()}_API_URL", "")
-        api_key = os.getenv(f"{provider.upper()}_API_KEY", "")
+    for candidate_provider, candidate_model in candidates:
+        api_key = _provider_key(candidate_provider)
+        start = time.perf_counter()
+        try:
+            if candidate_provider in {"openrouter", "nous"}:
+                api_url = (
+                    "https://openrouter.ai/api/v1/chat/completions"
+                    if candidate_provider == "openrouter"
+                    else "https://inference.nousresearch.com/v1/chat/completions"
+                )
+                if not api_key:
+                    raise RuntimeError(f"No API key for provider {candidate_provider}")
+                payload = {
+                    "model": candidate_model,
+                    "messages": [{
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {"type": "image_url", "image_url": {"url": _data_url() if not image_url.startswith(("http://", "https://")) else image_url}},
+                        ],
+                    }],
+                    "max_tokens": 2000,
+                    "temperature": 0.1,
+                }
+                headers = {
+                    "Authorization": f"Bearer {api_key}",
+                    "Content-Type": "application/json",
+                }
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    resp = await client.post(api_url, json=payload, headers=headers)
+                    resp.raise_for_status()
+                    data = resp.json()
+                analysis = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+                usage = data.get("usage", {})
+                tokens = {
+                    "prompt_tokens": usage.get("prompt_tokens", 0),
+                    "completion_tokens": usage.get("completion_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                }
+            elif candidate_provider == "gemini":
+                if not api_key:
+                    raise RuntimeError("No API key for provider gemini")
+                image_bytes, mime_type = _load_image_bytes_cached()
+                api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{candidate_model}:generateContent?key={api_key}"
+                payload = {
+                    "contents": [{"parts": [
+                        {"text": prompt},
+                        {"inline_data": {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}},
+                    ]}],
+                    "generationConfig": {"temperature": 0.1, "maxOutputTokens": 2000},
+                }
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    resp = await client.post(api_url, json=payload)
+                    resp.raise_for_status()
+                    data = resp.json()
+                parts = data.get("candidates", [{}])[0].get("content", {}).get("parts", [])
+                analysis = "\n".join(part.get("text", "") for part in parts if isinstance(part, dict) and part.get("text"))
+                usage = data.get("usageMetadata", {})
+                tokens = {
+                    "prompt_tokens": usage.get("promptTokenCount", 0),
+                    "completion_tokens": usage.get("candidatesTokenCount", 0),
+                    "total_tokens": usage.get("totalTokenCount", 0),
+                }
+            elif candidate_provider == "ollama":
+                image_bytes, _ = _load_image_bytes_cached()
+                payload = {
+                    "model": candidate_model,
+                    "stream": False,
+                    "messages": [{"role": "user", "content": prompt, "images": [base64.b64encode(image_bytes).decode()]}],
+                    "options": {"temperature": 0.1},
+                }
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    resp = await client.post("http://localhost:11434/api/chat", json=payload)
+                    resp.raise_for_status()
+                    data = resp.json()
+                analysis = data.get("message", {}).get("content", "")
+                tokens = {
+                    "prompt_tokens": data.get("prompt_eval_count", 0),
+                    "completion_tokens": data.get("eval_count", 0),
+                    "total_tokens": (data.get("prompt_eval_count", 0) or 0) + (data.get("eval_count", 0) or 0),
+                }
+            else:
+                raise RuntimeError(f"Unsupported provider {candidate_provider}")

-    if not api_key:
-        return {
-            "analysis": "",
-            "latency_ms": 0,
-            "tokens": {},
-            "success": False,
-            "error": f"No API key for provider {provider}",
-        }
+            latency_ms = (time.perf_counter() - start) * 1000
+            return {
+                "analysis": analysis,
+                "latency_ms": round(latency_ms, 1),
+                "tokens": tokens,
+                "success": True,
+                "error": "",
+                "provider_used": candidate_provider,
+                "model_used": candidate_model,
+            }
+        except Exception as e:
+            failures.append(f"{candidate_provider}:{candidate_model} => {e}")

-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
+    return {
+        "analysis": "",
+        "latency_ms": 0,
+        "tokens": {},
+        "success": False,
+        "error": " | ".join(failures) if failures else "No runs",
+        "provider_used": candidates[-1][0] if candidates else provider,
+        "model_used": candidates[-1][1] if candidates else model_id,
    }

-    payload = {
-        "model": model_id,
-        "messages": messages,
-        "max_tokens": 2000,
-        "temperature": 0.1,
-    }
-
-    start = time.perf_counter()
-    try:
-        async with httpx.AsyncClient(timeout=timeout) as client:
-            resp = await client.post(api_url, json=payload, headers=headers)
-            resp.raise_for_status()
-            data = resp.json()
-
-        latency_ms = (time.perf_counter() - start) * 1000
-
-        analysis = ""
-        choices = data.get("choices", [])
-        if choices:
-            msg = choices[0].get("message", {})
-            analysis = msg.get("content", "")
-
-        usage = data.get("usage", {})
-        tokens = {
-            "prompt_tokens": usage.get("prompt_tokens", 0),
-            "completion_tokens": usage.get("completion_tokens", 0),
-            "total_tokens": usage.get("total_tokens", 0),
-        }
-
-        return {
-            "analysis": analysis,
-            "latency_ms": round(latency_ms, 1),
-            "tokens": tokens,
-            "success": True,
-            "error": "",
-        }
-
-    except Exception as e:
-        return {
-            "analysis": "",
-            "latency_ms": round((time.perf_counter() - start) * 1000, 1),
-            "tokens": {},
-            "success": False,
-            "error": str(e),
-        }
-

 # ---------------------------------------------------------------------------
 # Evaluation metrics
@@ -398,7 +463,13 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
        failed = [r[model_name] for r in results if not r[model_name]["success"]]

        if not model_results:
-            summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
+            summary[model_name] = {
+                "success_rate": 0,
+                "error": "All runs failed",
+                "total_runs": 0,
+                "total_failures": len(failed),
+                "failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
+            }
            continue

        latencies = [r["avg_latency_ms"] for r in model_results]
@@ -410,6 +481,7 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
            "success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
            "total_runs": len(model_results),
            "total_failures": len(failed),
+            "failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
            "latency": {
                "mean_ms": round(statistics.mean(latencies), 1),
                "median_ms": round(statistics.median(latencies), 1),
@@ -495,6 +567,23 @@ def to_markdown(report: dict) -> str:
                f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
            )

+    lines += ["", "## Failure Modes", ""]
+    had_failures = False
+    for mkey, mname in config["models"].items():
+        model_summary = summary.get(mkey, {})
+        failure_examples = model_summary.get("failure_examples", [])
+        if not failure_examples and not model_summary.get("error"):
+            continue
+        had_failures = True
+        lines.append(f"### {mname}")
+        if model_summary.get("error"):
+            lines.append(f"- Summary: {model_summary['error']}")
+        for err in failure_examples:
+            lines.append(f"- {err}")
+        lines.append("")
+    if not had_failures:
+        lines.append("- No provider/runtime failures recorded.")
+
    # Verdict
    lines += ["", "## Verdict", ""]

@@ -516,8 +605,12 @@ def to_markdown(report: dict) -> str:

    if best_model:
        lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
+        lines.append("")
+        lines.append("Recommendation: keep the best-performing Gemma/Gemini lane from this run and only switch if repeated runs disagree.")
    else:
-        lines.append("No clear winner — insufficient data.")
+        lines.append("Benchmark blocked or insufficient data for a trustworthy winner.")
+        lines.append("")
+        lines.append("Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.")

    return "\n".join(lines)

@@ -528,44 +621,124 @@ def to_markdown(report: dict) -> str:


 def generate_sample_dataset() -> List[dict]:
-    """Generate a sample test dataset with diverse public images.
+    """Generate a larger benchmark dataset aligned with issue #817.

-    Returns list of test image definitions.
+    Returns 50+ images across screenshots, diagrams, photos, OCR, charts,
+    and document-like images so the harness matches the issue contract.
    """
-    return [
-        # Screenshots
-        {
-            "id": "screenshot_github",
-            "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
+    dataset: List[dict] = []
+
+    screenshots = [
+        ("github_mark", "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", ["github", "logo", "mark"]),
+        ("github_social", "https://github.githubassets.com/images/modules/site/social-cards.png", ["github", "page", "web"]),
+        ("github_code_search", "https://github.githubassets.com/images/modules/site/features-code-search.png", ["search", "code", "feature"]),
+        ("terminal_capture", "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", ["terminal", "command", "output"]),
+        ("http_404", "https://http.cat/404.jpg", ["404", "error", "cat"]),
+        ("dummy_cli_01", "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01", ["hermes", "cli", "session"]),
+        ("dummy_cli_02", "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard", ["prompt", "cache", "dashboard"]),
+        ("dummy_ui_01", "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode", ["settings", "voice", "mode"]),
+        ("dummy_ui_02", "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview", ["browser", "vision", "preview"]),
+        ("dummy_ui_03", "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector", ["tool", "call", "inspector"]),
+    ]
+    for ident, url, keywords in screenshots:
+        dataset.append({
+            "id": f"screenshot_{ident}",
+            "url": url,
            "category": "screenshot",
-            "expected_keywords": ["github", "logo", "octocat"],
-            "expected_structure": {"min_length": 50, "min_sentences": 2},
-        },
-        # Diagrams
-        {
-            "id": "diagram_architecture",
-            "url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
+            "expected_keywords": keywords,
+            "ground_truth_ocr": "",
+            "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
+        })
+
+    diagrams = [
+        ("flow_a", "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker", ["flowchart", "api", "worker"]),
+        ("flow_b", "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client", ["architecture", "diagram", "cache"]),
+        ("uml_a", "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message", ["class", "diagram", "session"]),
+        ("uml_b", "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response", ["sequence", "diagram", "response"]),
+        ("network_a", "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router", ["network", "node", "router"]),
+        ("network_b", "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth", ["service", "mesh", "auth"]),
+        ("state_machine", "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop", ["state", "machine", "idle"]),
+        ("mind_map", "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools", ["mind", "memory", "tools"]),
+        ("pipeline", "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize", ["pipeline", "ingest", "summarize"]),
+        ("org_chart", "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops", ["org", "chart", "review"]),
+    ]
+    for ident, url, keywords in diagrams:
+        dataset.append({
+            "id": f"diagram_{ident}",
+            "url": url,
            "category": "diagram",
-            "expected_keywords": ["architecture", "component", "service"],
-            "expected_structure": {"min_length": 100, "min_sentences": 3},
-        },
-        # Photos
-        {
-            "id": "photo_nature",
-            "url": "https://picsum.photos/seed/bench1/400/300",
+            "expected_keywords": keywords,
+            "ground_truth_ocr": "",
+            "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": False},
+        })
+
+    for idx in range(1, 11):
+        dataset.append({
+            "id": f"photo_random_{idx:02d}",
+            "url": f"https://picsum.photos/seed/vision-bench-{idx}/640/480",
            "category": "photo",
            "expected_keywords": [],
-            "expected_structure": {"min_length": 30, "min_sentences": 1},
-        },
-        # Charts
-        {
-            "id": "chart_bar",
-            "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
-            "category": "chart",
-            "expected_keywords": ["bar", "chart", "data"],
-            "expected_structure": {"min_length": 50, "min_sentences": 2},
-        },
+            "ground_truth_ocr": "",
+            "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
+        })
+
+    charts = [
+        ("bar_quarterly", "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", ["bar", "chart", "revenue"]),
+        ("pie_market", "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", ["pie", "chart", "percentage"]),
+        ("line_temp", "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", ["line", "chart", "temperature"]),
+        ("radar_skill", "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", ["radar", "chart", "skill"]),
+        ("stacked_cloud", "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", ["stacked", "bar", "chart"]),
+        ("area_growth", "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}", ["line", "growth", "chart"]),
+        ("scatter_eval", "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}", ["scatter", "chart", "runs"]),
+        ("horizontal_bar", "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}", ["bar", "score", "ocr"]),
+        ("bubble_usage", "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}", ["bubble", "latency", "chart"]),
+        ("doughnut_devices", "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", ["doughnut", "chart", "device"]),
    ]
+    for ident, url, keywords in charts:
+        dataset.append({
+            "id": f"chart_{ident}",
+            "url": url,
+            "category": "chart",
+            "expected_keywords": keywords,
+            "ground_truth_ocr": "",
+            "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": True},
+        })
+
+    ocr_texts = [
+        "Hermes OCR Alpha 01",
+        "Prompt Cache Hit 87%",
+        "Session 42 Ready",
+        "Latency 118 ms",
+        "Voice Mode Enabled",
+    ]
+    for idx, text in enumerate(ocr_texts, start=1):
+        dataset.append({
+            "id": f"ocr_text_{idx:02d}",
+            "url": f"https://dummyimage.com/1200x320/ffffff/000000.png&text={text.replace(' ', '+')}",
+            "category": "ocr",
+            "expected_keywords": text.lower().split()[:2],
+            "ground_truth_ocr": text,
+            "expected_structure": {"min_length": 10, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
+        })
+
+    documents = [
+        "Invoice 1001 Total 42 Due 2026-04-22",
+        "Form A Name Alice Status Approved",
+        "Report Memory Recall Score 91 Percent",
+        "Checklist Crisis Escalation Call 988 Now",
+        "Meeting Notes Vision Benchmark Run Pending",
+    ]
+    for idx, text in enumerate(documents, start=1):
+        dataset.append({
+            "id": f"document_text_{idx:02d}",
+            "url": f"https://dummyimage.com/1400x900/f8fafc/0f172a.png&text={text.replace(' ', '+')}",
+            "category": "document",
+            "expected_keywords": text.lower().split()[:3],
+            "ground_truth_ocr": text,
+            "expected_structure": {"min_length": 20, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
+        })
+
+    return dataset


 def load_dataset(path: str) -> List[dict]:
@@ -585,7 +758,9 @@ async def main():
    parser.add_argument("--url", help="Single image URL to test")
    parser.add_argument("--category", default="photo", help="Category for single URL")
    parser.add_argument("--output", default=None, help="Output JSON file")
+    parser.add_argument("--markdown-output", default=None, help="Optional markdown report output path")
    parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
+    parser.add_argument("--limit", type=int, default=0, help="Limit to the first N images for smoke runs")
    parser.add_argument("--models", nargs="+", default=None,
                        help="Models to test (default: all)")
    parser.add_argument("--markdown", action="store_true", help="Output markdown report")
@@ -617,9 +792,14 @@ async def main():
        print("ERROR: Provide --images or --url")
        sys.exit(1)

+    if args.limit and args.limit > 0:
+        images = images[:args.limit]
+
    # Run benchmark
    report = await run_benchmark_suite(images, selected, args.runs)

+    markdown_report = to_markdown(report)
+
    # Output
    if args.output:
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
@@ -627,8 +807,14 @@ async def main():
            json.dump(report, f, indent=2)
        print(f"\nResults saved to {args.output}")

+    if args.markdown_output:
+        os.makedirs(os.path.dirname(args.markdown_output) or ".", exist_ok=True)
+        with open(args.markdown_output, "w", encoding="utf-8") as f:
+            f.write(markdown_report)
+        print(f"Markdown report saved to {args.markdown_output}")
+
    if args.markdown or not args.output:
-        print("\n" + to_markdown(report))
+        print("\n" + markdown_report)


 if __name__ == "__main__":
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -523,7 +523,7 @@ DEFAULT_CONFIG = {
    
    # Text-to-speech configuration
    "tts": {
-        "provider": "edge",  # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local) | "kittentts" (local)
+        "provider": "edge",  # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local)
        "edge": {
            "voice": "en-US-AriaNeural",
            # Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
@@ -547,12 +547,6 @@ DEFAULT_CONFIG = {
            "model": "neuphonic/neutts-air-q4-gguf",  # HuggingFace model repo
            "device": "cpu",  # cpu, cuda, or mps
        },
-        "kittentts": {
-            "model": "KittenML/kitten-tts-nano-0.8-int8",  # 25MB int8 default
-            "voice": "Jasper",  # Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo
-            "speed": 1.0,
-            "clean_text": True,
-        },
    },
    
    "stt": {
--- a/hermes_cli/setup.py
+++ b/hermes_cli/setup.py
@@ -443,16 +443,6 @@ def _print_setup_summary(config: dict, hermes_home):
            tool_status.append(("Text-to-Speech (NeuTTS local)", True, None))
        else:
            tool_status.append(("Text-to-Speech (NeuTTS — not installed)", False, "run 'hermes setup tts'"))
-    elif tts_provider == "kittentts":
-        try:
-            import importlib.util
-            kittentts_ok = importlib.util.find_spec("kittentts") is not None
-        except Exception:
-            kittentts_ok = False
-        if kittentts_ok:
-            tool_status.append(("Text-to-Speech (KittenTTS local)", True, None))
-        else:
-            tool_status.append(("Text-to-Speech (KittenTTS — not installed)", False, "run 'hermes setup tts'"))
    else:
        tool_status.append(("Text-to-Speech (Edge TTS)", True, None))

@@ -901,7 +891,6 @@ def _install_neutts_deps() -> bool:
                return False
        else:
            print_warning("espeak-ng is required for NeuTTS. Install it manually before using NeuTTS.")
-            return False

    # Install neutts Python package
    print()
@@ -921,34 +910,8 @@ def _install_neutts_deps() -> bool:
        return False


-def _install_kittentts_deps() -> bool:
-    """Install KittenTTS dependencies with user approval. Returns True on success."""
-    import subprocess
-    import sys
-
-    wheel_url = (
-        "https://github.com/KittenML/KittenTTS/releases/download/"
-        "0.8.1/kittentts-0.8.1-py3-none-any.whl"
-    )
-    print()
-    print_info("Installing kittentts Python package (~25-80MB model downloaded on first use)...")
-    print()
-    try:
-        subprocess.run(
-            [sys.executable, "-m", "pip", "install", "-U", wheel_url, "soundfile", "--quiet"],
-            check=True, timeout=300,
-        )
-        print_success("kittentts installed successfully")
-        return True
-    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
-        print_error(f"Failed to install kittentts: {e}")
-        print_info(f"Try manually: python -m pip install -U '{wheel_url}' soundfile")
-        return False
-
-
 def _setup_tts_provider(config: dict):
-    """Interactive TTS provider selection with install flow for local providers."""
-
+    """Interactive TTS provider selection with install flow for NeuTTS."""
    tts_config = config.get("tts", {})
    current_provider = tts_config.get("provider", "edge")
    subscription_features = get_nous_subscription_features(config)
@@ -960,7 +923,6 @@ def _setup_tts_provider(config: dict):
        "minimax": "MiniMax TTS",
        "mistral": "Mistral Voxtral TTS",
        "neutts": "NeuTTS",
-        "kittentts": "KittenTTS",
    }
    current_label = provider_labels.get(current_provider, current_provider)

@@ -982,10 +944,9 @@ def _setup_tts_provider(config: dict):
            "MiniMax TTS (high quality with voice cloning, needs API key)",
            "Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
            "NeuTTS (local on-device, free, ~300MB model download)",
-            "KittenTTS (local on-device, free, lightweight ~25-80MB ONNX)",
        ]
    )
-    providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts", "kittentts"])
+    providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts"])
    choices.append(f"Keep current ({current_label})")
    keep_current_idx = len(choices) - 1
    idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
@@ -1027,28 +988,6 @@ def _setup_tts_provider(config: dict):
                print_info("Skipping install. Set tts.provider to 'neutts' after installing manually.")
                selected = "edge"

-    elif selected == "kittentts":
-        try:
-            import importlib.util
-            already_installed = importlib.util.find_spec("kittentts") is not None
-        except Exception:
-            already_installed = False
-
-        if already_installed:
-            print_success("KittenTTS is already installed")
-        else:
-            print()
-            print_info("KittenTTS is lightweight (~25-80MB, CPU-only, no API key required).")
-            print_info("Voices: Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo")
-            print()
-            if prompt_yes_no("Install KittenTTS now?", True):
-                if not _install_kittentts_deps():
-                    print_warning("KittenTTS installation incomplete. Falling back to Edge TTS.")
-                    selected = "edge"
-            else:
-                print_info("Skipping install. Set tts.provider to 'kittentts' after installing manually.")
-                selected = "edge"
-
    elif selected == "elevenlabs":
        existing = get_env_value("ELEVENLABS_API_KEY")
        if not existing:
--- a/hermes_cli/tools_config.py
+++ b/hermes_cli/tools_config.py
@@ -164,14 +164,6 @@ TOOL_CATEGORIES = {
                ],
                "tts_provider": "mistral",
            },
-            {
-                "name": "KittenTTS",
-                "badge": "local · free",
-                "tag": "Lightweight local ONNX TTS (~25MB), no API key",
-                "env_vars": [],
-                "tts_provider": "kittentts",
-                "post_setup": "kittentts",
-            },
        ],
    },
    "web": {
@@ -411,36 +403,6 @@ def _run_post_setup(post_setup_key: str):
            _print_warning("    Node.js not found. Install Camofox via Docker:")
            _print_info("      docker run -p 9377:9377 -e CAMOFOX_PORT=9377 jo-inc/camofox-browser")

-    elif post_setup_key == "kittentts":
-        try:
-            __import__("kittentts")
-            _print_success("    kittentts is already installed")
-            return
-        except ImportError:
-            pass
-        import subprocess
-        _print_info("    Installing kittentts (~25-80MB model, CPU-only)...")
-        wheel_url = (
-            "https://github.com/KittenML/KittenTTS/releases/download/"
-            "0.8.1/kittentts-0.8.1-py3-none-any.whl"
-        )
-        try:
-            result = subprocess.run(
-                [sys.executable, "-m", "pip", "install", "-U", wheel_url, "soundfile", "--quiet"],
-                capture_output=True, text=True, timeout=300,
-            )
-            if result.returncode == 0:
-                _print_success("    kittentts installed")
-                _print_info("    Voices: Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo")
-                _print_info("    Models: KittenML/kitten-tts-nano-0.8-int8 (25MB), micro (41MB), mini (80MB)")
-            else:
-                _print_warning("    kittentts install failed:")
-                _print_info(f"      {result.stderr.strip()[:300]}")
-                _print_info(f"    Run manually: python -m pip install -U '{wheel_url}' soundfile")
-        except subprocess.TimeoutExpired:
-            _print_warning("    kittentts install timed out (>5min)")
-            _print_info(f"    Run manually: python -m pip install -U '{wheel_url}' soundfile")
-
    elif post_setup_key == "rl_training":
        try:
            __import__("tinker_atropos")
--- a/metrics/vision-benchmark-smoke-2026-04-22.json
+++ b/metrics/vision-benchmark-smoke-2026-04-22.json
@@ -0,0 +1,67 @@
+{
+  "generated_at": "2026-04-22T16:21:56.271426+00:00",
+  "config": {
+    "total_images": 2,
+    "runs_per_model": 1,
+    "models": {
+      "gemma4": "Gemma 4 27B",
+      "gemini3_flash": "Gemini 3 Flash Preview"
+    }
+  },
+  "results": [
+    {
+      "gemma4": {
+        "success": false,
+        "error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500",
+        "runs": 0,
+        "errors": 1
+      },
+      "gemini3_flash": {
+        "success": false,
+        "error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
+        "runs": 0,
+        "errors": 1
+      },
+      "image_id": "screenshot_github_mark",
+      "category": "screenshot"
+    },
+    {
+      "gemma4": {
+        "success": false,
+        "error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
+        "runs": 0,
+        "errors": 1
+      },
+      "gemini3_flash": {
+        "success": false,
+        "error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found",
+        "runs": 0,
+        "errors": 1
+      },
+      "image_id": "screenshot_github_social",
+      "category": "screenshot"
+    }
+  ],
+  "summary": {
+    "gemma4": {
+      "success_rate": 0,
+      "error": "All runs failed",
+      "total_runs": 0,
+      "total_failures": 2,
+      "failure_examples": [
+        "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
+        "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500"
+      ]
+    },
+    "gemini3_flash": {
+      "success_rate": 0,
+      "error": "All runs failed",
+      "total_runs": 0,
+      "total_failures": 2,
+      "failure_examples": [
+        "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
+        "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found"
+      ]
+    }
+  }
+}
--- a/metrics/vision-benchmark-smoke-2026-04-22.md
+++ b/metrics/vision-benchmark-smoke-2026-04-22.md
@@ -0,0 +1,44 @@
+# Vision Benchmark Report
+
+Generated: 2026-04-22T16:21
+Images tested: 2
+Runs per model: 1
+Models: Gemma 4 27B, Gemini 3 Flash Preview
+
+## Latency Comparison
+
+| Model | Mean (ms) | Median | P95 | Std Dev |
+|-------|-----------|--------|-----|---------|
+
+## Accuracy Comparison
+
+| Model | OCR Accuracy | Keyword Coverage | Success Rate |
+|-------|-------------|-----------------|--------------|
+
+## Token Usage
+
+| Model | Mean Tokens/Image | Total Tokens |
+|-------|------------------|--------------|
+
+## Failure Modes
+
+### Gemma 4 27B
+- Summary: All runs failed
+- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found
+- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'
+For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500
+
+### Gemini 3 Flash Preview
+- Summary: All runs failed
+- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
+For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'
+For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
+- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
+For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found
+
+
+## Verdict
+
+Benchmark blocked or insufficient data for a trustworthy winner.
+
+Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.
--- a/tests/test_vision_benchmark.py
+++ b/tests/test_vision_benchmark.py
@@ -199,7 +199,7 @@ class TestMarkdown:
 class TestDataset:
    def test_sample_dataset_has_entries(self):
        dataset = generate_sample_dataset()
-        assert len(dataset) >= 4
+        assert len(dataset) >= 50

    def test_sample_dataset_structure(self):
        dataset = generate_sample_dataset()
@@ -216,6 +216,9 @@ class TestDataset:
        assert "screenshot" in categories
        assert "diagram" in categories
        assert "photo" in categories
+        assert "chart" in categories
+        assert "ocr" in categories
+        assert "document" in categories


 class TestModels:
--- a/tests/test_vision_benchmark_artifacts.py
+++ b/tests/test_vision_benchmark_artifacts.py
@@ -0,0 +1,21 @@
+import json
+from pathlib import Path
+
+
+DATASET = Path("benchmarks/test_images.json")
+REPORT = Path("metrics/vision-benchmark-smoke-2026-04-22.md")
+
+
+def test_benchmark_dataset_is_issue_sized_and_category_complete() -> None:
+    items = json.loads(DATASET.read_text(encoding="utf-8"))
+    assert len(items) >= 50
+    categories = {item["category"] for item in items}
+    assert {"screenshot", "diagram", "photo", "ocr", "chart", "document"}.issubset(categories)
+
+
+def test_metrics_report_exists_with_recommendation() -> None:
+    assert REPORT.exists(), "missing benchmark report under metrics/"
+    text = REPORT.read_text(encoding="utf-8")
+    assert "Recommendation" in text
+    assert "Gemma 4" in text
+    assert "Gemini" in text
--- a/tests/tools/test_tts_kittentts.py
+++ b/tests/tools/test_tts_kittentts.py
@@ -1,236 +0,0 @@
-"""Tests for the KittenTTS local provider in tools/tts_tool.py."""
-
-import json
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-import pytest
-
-
-@pytest.fixture(autouse=True)
-def clean_env(monkeypatch):
-    for key in ("HERMES_SESSION_PLATFORM",):
-        monkeypatch.delenv(key, raising=False)
-
-
-@pytest.fixture(autouse=True)
-def clear_kittentts_cache():
-    """Reset the module-level model cache between tests."""
-    from tools import tts_tool as _tt
-    _tt._kittentts_model_cache.clear()
-    yield
-    _tt._kittentts_model_cache.clear()
-
-
-@pytest.fixture
-def mock_kittentts_module():
-    """Inject a fake kittentts + soundfile module that return stub objects."""
-    fake_model = MagicMock()
-    # 24kHz float32 PCM at ~2s of silence
-    fake_model.generate.return_value = np.zeros(48000, dtype=np.float32)
-    fake_cls = MagicMock(return_value=fake_model)
-    fake_kittentts = MagicMock()
-    fake_kittentts.KittenTTS = fake_cls
-
-    # Stub soundfile — the real package isn't installed in CI venv, and
-    # _generate_kittentts does `import soundfile as sf` at runtime.
-    fake_sf = MagicMock()
-
-    def _fake_write(path, audio, samplerate):
-        # Emulate writing a real file so downstream path checks succeed.
-        import pathlib
-
-        pathlib.Path(path).write_bytes(b"RIFF\x00\x00\x00\x00WAVEfmt fake")
-
-    fake_sf.write = _fake_write
-
-    with patch.dict(
-        "sys.modules",
-        {"kittentts": fake_kittentts, "soundfile": fake_sf},
-    ):
-        yield fake_model, fake_cls
-
-
-class TestGenerateKittenTts:
-    def test_successful_wav_generation(self, tmp_path, mock_kittentts_module):
-        from tools.tts_tool import _generate_kittentts
-
-        fake_model, fake_cls = mock_kittentts_module
-        output_path = str(tmp_path / "test.wav")
-        result = _generate_kittentts("Hello world", output_path, {})
-
-        assert result == output_path
-        assert (tmp_path / "test.wav").exists()
-        fake_cls.assert_called_once()
-        fake_model.generate.assert_called_once()
-
-    def test_config_passes_voice_speed_cleantext(self, tmp_path, mock_kittentts_module):
-        from tools.tts_tool import _generate_kittentts
-
-        fake_model, _ = mock_kittentts_module
-        config = {
-            "kittentts": {
-                "model": "KittenML/kitten-tts-mini-0.8",
-                "voice": "Luna",
-                "speed": 1.25,
-                "clean_text": False,
-            }
-        }
-        _generate_kittentts("Hi there", str(tmp_path / "out.wav"), config)
-
-        call_kwargs = fake_model.generate.call_args.kwargs
-        assert call_kwargs["voice"] == "Luna"
-        assert call_kwargs["speed"] == 1.25
-        assert call_kwargs["clean_text"] is False
-
-    def test_default_model_and_voice(self, tmp_path, mock_kittentts_module):
-        from tools.tts_tool import (
-            DEFAULT_KITTENTTS_MODEL,
-            DEFAULT_KITTENTTS_VOICE,
-            _generate_kittentts,
-        )
-
-        fake_model, fake_cls = mock_kittentts_module
-        _generate_kittentts("Hi", str(tmp_path / "out.wav"), {})
-
-        fake_cls.assert_called_once_with(DEFAULT_KITTENTTS_MODEL)
-        assert fake_model.generate.call_args.kwargs["voice"] == DEFAULT_KITTENTTS_VOICE
-
-    def test_model_is_cached_across_calls(self, tmp_path, mock_kittentts_module):
-        from tools.tts_tool import _generate_kittentts
-
-        _, fake_cls = mock_kittentts_module
-        _generate_kittentts("One", str(tmp_path / "a.wav"), {})
-        _generate_kittentts("Two", str(tmp_path / "b.wav"), {})
-
-        # Same model name → class instantiated exactly once
-        assert fake_cls.call_count == 1
-
-    def test_different_models_are_cached_separately(self, tmp_path, mock_kittentts_module):
-        from tools.tts_tool import _generate_kittentts
-
-        _, fake_cls = mock_kittentts_module
-        _generate_kittentts(
-            "A",
-            str(tmp_path / "a.wav"),
-            {"kittentts": {"model": "KittenML/kitten-tts-nano-0.8-int8"}},
-        )
-        _generate_kittentts(
-            "B",
-            str(tmp_path / "b.wav"),
-            {"kittentts": {"model": "KittenML/kitten-tts-mini-0.8"}},
-        )
-
-        assert fake_cls.call_count == 2
-
-    def test_non_wav_extension_triggers_ffmpeg_conversion(
-        self, tmp_path, mock_kittentts_module, monkeypatch
-    ):
-        """Non-.wav output path causes WAV → target ffmpeg conversion."""
-        from tools import tts_tool as _tt
-
-        calls = []
-
-        def fake_shutil_which(cmd):
-            return "/usr/bin/ffmpeg" if cmd == "ffmpeg" else None
-
-        def fake_run(cmd, check=False, timeout=None, **kw):
-            calls.append(cmd)
-            # Emulate ffmpeg writing the output file
-            import pathlib
-
-            out_path = cmd[-1]
-            pathlib.Path(out_path).write_bytes(b"fake-mp3-data")
-            return MagicMock(returncode=0)
-
-        monkeypatch.setattr(_tt.shutil, "which", fake_shutil_which)
-        monkeypatch.setattr(_tt.subprocess, "run", fake_run)
-
-        output_path = str(tmp_path / "test.mp3")
-        result = _tt._generate_kittentts("Hi", output_path, {})
-
-        assert result == output_path
-        assert len(calls) == 1
-        assert calls[0][0] == "/usr/bin/ffmpeg"
-
-    def test_missing_kittentts_raises_import_error(self, tmp_path, monkeypatch):
-        """When kittentts package is not installed, _import_kittentts raises."""
-        import sys
-
-        monkeypatch.setitem(sys.modules, "kittentts", None)
-        from tools.tts_tool import _generate_kittentts
-
-        with pytest.raises((ImportError, TypeError)):
-            _generate_kittentts("Hi", str(tmp_path / "out.wav"), {})
-
-
-class TestCheckKittenttsAvailable:
-    def test_reports_available_when_package_present(self, monkeypatch):
-        import importlib.util
-        from tools.tts_tool import _check_kittentts_available
-
-        fake_spec = MagicMock()
-        monkeypatch.setattr(
-            importlib.util,
-            "find_spec",
-            lambda name: fake_spec if name == "kittentts" else None,
-        )
-        assert _check_kittentts_available() is True
-
-    def test_reports_unavailable_when_package_missing(self, monkeypatch):
-        import importlib.util
-        from tools.tts_tool import _check_kittentts_available
-
-        monkeypatch.setattr(importlib.util, "find_spec", lambda name: None)
-        assert _check_kittentts_available() is False
-
-
-class TestDispatcherBranch:
-    def test_kittentts_not_installed_returns_helpful_error(self, monkeypatch, tmp_path):
-        """When provider=kittentts but package missing, return JSON error with setup hint."""
-        import sys
-
-        monkeypatch.setitem(sys.modules, "kittentts", None)
-        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
-
-        from tools.tts_tool import text_to_speech_tool
-
-        # Write a config telling it to use kittentts
-        import yaml
-
-        (tmp_path / "config.yaml").write_text(
-            yaml.safe_dump({"tts": {"provider": "kittentts"}})
-        )
-
-        result = json.loads(text_to_speech_tool(text="Hello"))
-        assert result["success"] is False
-        assert "kittentts" in result["error"].lower()
-        assert "hermes setup tts" in result["error"].lower()
-
-    def test_non_telegram_explicit_wav_path_is_preserved(
-        self, monkeypatch, tmp_path, mock_kittentts_module
-    ):
-        """Explicit WAV outputs should stay WAV outside Telegram sessions."""
-        import yaml
-        from tools import tts_tool as _tt
-
-        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
-        (tmp_path / "config.yaml").write_text(
-            yaml.safe_dump({"tts": {"provider": "kittentts"}})
-        )
-
-        def fail_convert(_path):
-            raise AssertionError("_convert_to_opus should not run outside Telegram")
-
-        monkeypatch.setattr(_tt, "_convert_to_opus", fail_convert)
-
-        result = json.loads(
-            _tt.text_to_speech_tool(
-                text="Hello from KittenTTS",
-                output_path=str(tmp_path / "out.wav"),
-            )
-        )
-
-        assert result["success"] is True
-        assert result["file_path"] == str(tmp_path / "out.wav")
-        assert (tmp_path / "out.wav").exists()
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -2,14 +2,13 @@
 """
 Text-to-Speech Tool Module

-Supports seven TTS providers:
+Supports six TTS providers:
 - Edge TTS (default, free, no API key): Microsoft Edge neural voices
 - ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
 - OpenAI TTS: Good quality, needs OPENAI_API_KEY
 - MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
 - Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
 - NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
- KittenTTS (local, free, no API key): Lightweight on-device ONNX TTS via kittentts

 Output formats:
 - Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS)
@@ -78,12 +77,6 @@ def _import_sounddevice():
    return sd


-def _import_kittentts():
-    """Lazy import KittenTTS. Returns the class or raises ImportError."""
-    from kittentts import KittenTTS
-    return KittenTTS
-
-
 # ===========================================================================
 # Defaults
 # ===========================================================================
@@ -93,8 +86,6 @@ DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB"  # Adam
 DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
 DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
 DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
-DEFAULT_KITTENTTS_MODEL = "KittenML/kitten-tts-nano-0.8-int8"  # 25MB
-DEFAULT_KITTENTTS_VOICE = "Jasper"
 DEFAULT_OPENAI_VOICE = "alloy"
 DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
 DEFAULT_MINIMAX_MODEL = "speech-2.8-hd"
@@ -457,15 +448,6 @@ def _check_neutts_available() -> bool:
        return False


-def _check_kittentts_available() -> bool:
-    """Check if the kittentts engine is importable (installed locally)."""
-    try:
-        import importlib.util
-        return importlib.util.find_spec("kittentts") is not None
-    except Exception:
-        return False
-
-
 def _default_neutts_ref_audio() -> str:
    """Return path to the bundled default voice reference audio."""
    return str(Path(__file__).parent / "neutts_samples" / "jo.wav")
@@ -529,51 +511,6 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) ->
    return output_path


-# ===========================================================================
-# Provider: KittenTTS (local, lightweight)
-# ===========================================================================
-
-# Module-level cache for KittenTTS model instances
-_kittentts_model_cache: Dict[str, Any] = {}
-
-
-def _generate_kittentts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
-    """Generate speech using the local KittenTTS ONNX model."""
-    KittenTTS = _import_kittentts()
-    kt_config = tts_config.get("kittentts", {})
-    model_name = kt_config.get("model", DEFAULT_KITTENTTS_MODEL)
-    voice = kt_config.get("voice", DEFAULT_KITTENTTS_VOICE)
-    speed = kt_config.get("speed", 1.0)
-    clean_text = kt_config.get("clean_text", True)
-
-    global _kittentts_model_cache
-    if model_name not in _kittentts_model_cache:
-        logger.info("[KittenTTS] Loading model: %s", model_name)
-        _kittentts_model_cache[model_name] = KittenTTS(model_name)
-
-    model = _kittentts_model_cache[model_name]
-    audio = model.generate(text, voice=voice, speed=speed, clean_text=clean_text)
-
-    import soundfile as sf
-
-    wav_path = output_path
-    if not output_path.endswith(".wav"):
-        wav_path = output_path.rsplit(".", 1)[0] + ".wav"
-
-    sf.write(wav_path, audio, 24000)
-
-    if wav_path != output_path:
-        ffmpeg = shutil.which("ffmpeg")
-        if ffmpeg:
-            conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
-            subprocess.run(conv_cmd, check=True, timeout=30)
-            os.remove(wav_path)
-        else:
-            os.rename(wav_path, output_path)
-
-    return output_path
-
-
 # ===========================================================================
 # Main tool function
 # ===========================================================================
@@ -685,19 +622,6 @@ def text_to_speech_tool(
            logger.info("Generating speech with NeuTTS (local)...")
            _generate_neutts(text, file_str, tts_config)

-        elif provider == "kittentts":
-            try:
-                _import_kittentts()
-            except ImportError:
-                return json.dumps({
-                    "success": False,
-                    "error": "KittenTTS provider selected but 'kittentts' package not installed. "
-                             "Run 'hermes setup tts' and choose KittenTTS, or install manually: "
-                             "pip install https://github.com/KittenML/KittenTTS/releases/download/0.8.1/kittentts-0.8.1-py3-none-any.whl"
-                }, ensure_ascii=False)
-            logger.info("Generating speech with KittenTTS (local, lightweight)...")
-            _generate_kittentts(text, file_str, tts_config)
-
        else:
            # Default: Edge TTS (free), with NeuTTS as local fallback
            edge_available = True
@@ -734,10 +658,10 @@ def text_to_speech_tool(
                "error": f"TTS generation produced no output (provider: {provider})"
            }, ensure_ascii=False)

-        # Try Opus conversion for Telegram compatibility only.
-        # Outside Telegram, preserve the caller's explicit output format.
+        # Try Opus conversion for Telegram compatibility
+        # Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
        voice_compatible = False
-        if want_opus and provider in ("edge", "neutts", "minimax", "kittentts") and not file_str.endswith(".ogg"):
+        if provider in ("edge", "neutts", "minimax") and not file_str.endswith(".ogg"):
            opus_path = _convert_to_opus(file_str)
            if opus_path:
                file_str = opus_path
@@ -818,8 +742,6 @@ def check_tts_requirements() -> bool:
        pass
    if _check_neutts_available():
        return True
-    if _check_kittentts_available():
-        return True
    return False


--- a/website/docs/user-guide/features/tts.md
+++ b/website/docs/user-guide/features/tts.md
@@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription

 ## Text-to-Speech

-Convert text to speech with seven providers:
+Convert text to speech with six providers:

 | Provider | Quality | Cost | API Key |
 |----------|---------|------|---------|
@@ -20,7 +20,6 @@ Convert text to speech with seven providers:
 | **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
 | **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
 | **NeuTTS** | Good | Free | None needed |
-| **KittenTTS** | Good | Free (local) | None needed |

 ### Platform Delivery

@@ -36,7 +35,7 @@ Convert text to speech with seven providers:
 ```yaml
 # In ~/.hermes/config.yaml
 tts:
-  provider: "edge"              # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts" | "kittentts"
+  provider: "edge"              # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts"
  speed: 1.0                    # Global speed multiplier (provider-specific settings override this)
  edge:
    voice: "en-US-AriaNeural"   # 322 voices, 74 languages
@@ -63,11 +62,6 @@ tts:
    ref_text: ''
    model: neuphonic/neutts-air-q4-gguf
    device: cpu
-  kittentts:
-    model: KittenML/kitten-tts-nano-0.8-int8   # 25MB int8 default; also micro and mini variants
-    voice: Jasper                               # Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo
-    speed: 1.0
-    clean_text: true
 ```

 **Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed).
@@ -80,7 +74,6 @@ Telegram voice bubbles require Opus/OGG audio format:
 - **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
 - **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
 - **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
- **KittenTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles

 ```bash
 # Ubuntu/Debian
@@ -93,7 +86,7 @@ brew install ffmpeg
 sudo dnf install ffmpeg
 ```

-Without ffmpeg, Edge TTS, MiniMax TTS, NeuTTS, and KittenTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
+Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).

 :::tip
 If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider.
Author	SHA1	Message	Date
Alexander Whitestone	9d05f77a9b	feat: harden vision benchmark artifacts All checks were successful Lint / lint (pull_request) Successful in 9s Details Refs #817	2026-04-22 12:22:28 -04:00
Alexander Whitestone	23e093fc75	wip: tighten vision benchmark acceptance tests	2026-04-22 12:10:23 -04:00
Alexander Whitestone	f77ce4dff2	wip: add regression tests for vision benchmark artifacts	2026-04-22 12:07:52 -04:00