feat: harden vision benchmark artifacts

Refs #817
wip: tighten vision benchmark acceptance tests
2026-04-22 12:22:28 -04:00 · 2026-04-22 12:10:23 -04:00 · 2026-04-22 12:07:52 -04:00
7 changed files with 1137 additions and 768 deletions
--- a/benchmarks/test_images.json
+++ b/benchmarks/test_images.json
@@ -1,194 +1,757 @@
 [
  {
-    "id": "screenshot_github_home",
+    "id": "screenshot_github_mark",
    "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
    "category": "screenshot",
-    "expected_keywords": ["github", "logo", "mark"],
+    "expected_keywords": [
+      "github",
+      "logo",
+      "mark"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
-    "id": "diagram_mermaid_flow",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
+    "id": "screenshot_github_social",
+    "url": "https://github.githubassets.com/images/modules/site/social-cards.png",
+    "category": "screenshot",
+    "expected_keywords": [
+      "github",
+      "page",
+      "web"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_github_code_search",
+    "url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
+    "category": "screenshot",
+    "expected_keywords": [
+      "search",
+      "code",
+      "feature"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_terminal_capture",
+    "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
+    "category": "screenshot",
+    "expected_keywords": [
+      "terminal",
+      "command",
+      "output"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_http_404",
+    "url": "https://http.cat/404.jpg",
+    "category": "screenshot",
+    "expected_keywords": [
+      "404",
+      "error",
+      "cat"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "screenshot_dummy_cli_01",
+    "url": "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01",
+    "category": "screenshot",
+    "expected_keywords": [
+      "hermes",
+      "cli",
+      "session"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_dummy_cli_02",
+    "url": "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard",
+    "category": "screenshot",
+    "expected_keywords": [
+      "prompt",
+      "cache",
+      "dashboard"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_dummy_ui_01",
+    "url": "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode",
+    "category": "screenshot",
+    "expected_keywords": [
+      "settings",
+      "voice",
+      "mode"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_dummy_ui_02",
+    "url": "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview",
+    "category": "screenshot",
+    "expected_keywords": [
+      "browser",
+      "vision",
+      "preview"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "screenshot_dummy_ui_03",
+    "url": "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector",
+    "category": "screenshot",
+    "expected_keywords": [
+      "tool",
+      "call",
+      "inspector"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_flow_a",
+    "url": "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker",
    "category": "diagram",
-    "expected_keywords": ["flow", "diagram", "process"],
+    "expected_keywords": [
+      "flowchart",
+      "api",
+      "worker"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "photo_random_1",
-    "url": "https://picsum.photos/seed/vision1/400/300",
-    "category": "photo",
-    "expected_keywords": [],
+    "id": "diagram_flow_b",
+    "url": "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client",
+    "category": "diagram",
+    "expected_keywords": [
+      "architecture",
+      "diagram",
+      "cache"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "photo_random_2",
-    "url": "https://picsum.photos/seed/vision2/400/300",
-    "category": "photo",
-    "expected_keywords": [],
+    "id": "diagram_uml_a",
+    "url": "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message",
+    "category": "diagram",
+    "expected_keywords": [
+      "class",
+      "diagram",
+      "session"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "chart_simple_bar",
-    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
-    "category": "chart",
-    "expected_keywords": ["bar", "chart", "revenue"],
+    "id": "diagram_uml_b",
+    "url": "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response",
+    "category": "diagram",
+    "expected_keywords": [
+      "sequence",
+      "diagram",
+      "response"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "chart_pie",
-    "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
-    "category": "chart",
-    "expected_keywords": ["pie", "chart", "percentage"],
+    "id": "diagram_network_a",
+    "url": "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router",
+    "category": "diagram",
+    "expected_keywords": [
+      "network",
+      "node",
+      "router"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_network_b",
+    "url": "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth",
+    "category": "diagram",
+    "expected_keywords": [
+      "service",
+      "mesh",
+      "auth"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_state_machine",
+    "url": "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop",
+    "category": "diagram",
+    "expected_keywords": [
+      "state",
+      "machine",
+      "idle"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_mind_map",
+    "url": "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools",
+    "category": "diagram",
+    "expected_keywords": [
+      "mind",
+      "memory",
+      "tools"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "diagram_pipeline",
+    "url": "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize",
+    "category": "diagram",
+    "expected_keywords": [
+      "pipeline",
+      "ingest",
+      "summarize"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "diagram_org_chart",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops",
    "category": "diagram",
-    "expected_keywords": ["organization", "hierarchy", "chart"],
+    "expected_keywords": [
+      "org",
+      "chart",
+      "review"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
-    "id": "screenshot_terminal",
-    "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
-    "category": "screenshot",
-    "expected_keywords": ["terminal", "command", "output"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "photo_random_3",
-    "url": "https://picsum.photos/seed/vision3/400/300",
+    "id": "photo_random_01",
+    "url": "https://picsum.photos/seed/vision-bench-1/640/480",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
-    "id": "chart_line",
+    "id": "photo_random_02",
+    "url": "https://picsum.photos/seed/vision-bench-2/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_03",
+    "url": "https://picsum.photos/seed/vision-bench-3/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_04",
+    "url": "https://picsum.photos/seed/vision-bench-4/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_05",
+    "url": "https://picsum.photos/seed/vision-bench-5/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_06",
+    "url": "https://picsum.photos/seed/vision-bench-6/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_07",
+    "url": "https://picsum.photos/seed/vision-bench-7/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_08",
+    "url": "https://picsum.photos/seed/vision-bench-8/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_09",
+    "url": "https://picsum.photos/seed/vision-bench-9/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "photo_random_10",
+    "url": "https://picsum.photos/seed/vision-bench-10/640/480",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "chart_bar_quarterly",
+    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "bar",
+      "chart",
+      "revenue"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_pie_market",
+    "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "pie",
+      "chart",
+      "percentage"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_line_temp",
    "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
    "category": "chart",
-    "expected_keywords": ["line", "chart", "temperature"],
+    "expected_keywords": [
+      "line",
+      "chart",
+      "temperature"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
-    "id": "diagram_sequence",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
-    "category": "diagram",
-    "expected_keywords": ["sequence", "interaction", "message"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
-  },
-  {
-    "id": "photo_random_4",
-    "url": "https://picsum.photos/seed/vision4/400/300",
-    "category": "photo",
-    "expected_keywords": [],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "screenshot_webpage",
-    "url": "https://github.githubassets.com/images/modules/site/social-cards.png",
-    "category": "screenshot",
-    "expected_keywords": ["github", "page", "web"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "chart_radar",
+    "id": "chart_radar_skill",
    "url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
    "category": "chart",
-    "expected_keywords": ["radar", "chart", "skill"],
+    "expected_keywords": [
+      "radar",
+      "chart",
+      "skill"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
-    "id": "photo_random_5",
-    "url": "https://picsum.photos/seed/vision5/400/300",
-    "category": "photo",
-    "expected_keywords": [],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "diagram_class",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
-    "category": "diagram",
-    "expected_keywords": ["class", "object", "attribute"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
-  },
-  {
-    "id": "chart_doughnut",
-    "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
-    "category": "chart",
-    "expected_keywords": ["doughnut", "chart", "device"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
-  },
-  {
-    "id": "photo_random_6",
-    "url": "https://picsum.photos/seed/vision6/400/300",
-    "category": "photo",
-    "expected_keywords": [],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "screenshot_error",
-    "url": "https://http.cat/404.jpg",
-    "category": "screenshot",
-    "expected_keywords": ["404", "error", "cat"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
-  },
-  {
-    "id": "diagram_network",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
-    "category": "diagram",
-    "expected_keywords": ["network", "node", "connection"],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
-  },
-  {
-    "id": "photo_random_7",
-    "url": "https://picsum.photos/seed/vision7/400/300",
-    "category": "photo",
-    "expected_keywords": [],
-    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
-  },
-  {
-    "id": "chart_stacked_bar",
+    "id": "chart_stacked_cloud",
    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
    "category": "chart",
-    "expected_keywords": ["stacked", "bar", "chart"],
+    "expected_keywords": [
+      "stacked",
+      "bar",
+      "chart"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
-    "id": "screenshot_dashboard",
-    "url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
-    "category": "screenshot",
-    "expected_keywords": ["search", "code", "feature"],
+    "id": "chart_area_growth",
+    "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "line",
+      "growth",
+      "chart"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
-    "id": "photo_random_8",
-    "url": "https://picsum.photos/seed/vision8/400/300",
-    "category": "photo",
-    "expected_keywords": [],
+    "id": "chart_scatter_eval",
+    "url": "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "scatter",
+      "chart",
+      "runs"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_horizontal_bar",
+    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}",
+    "category": "chart",
+    "expected_keywords": [
+      "bar",
+      "score",
+      "ocr"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_bubble_usage",
+    "url": "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "bubble",
+      "latency",
+      "chart"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "chart_doughnut_devices",
+    "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
+    "category": "chart",
+    "expected_keywords": [
+      "doughnut",
+      "chart",
+      "device"
+    ],
+    "ground_truth_ocr": "",
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_01",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Hermes+OCR+Alpha+01",
+    "category": "ocr",
+    "expected_keywords": [
+      "hermes",
+      "ocr"
+    ],
+    "ground_truth_ocr": "Hermes OCR Alpha 01",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_02",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Prompt+Cache+Hit+87%25",
+    "category": "ocr",
+    "expected_keywords": [
+      "prompt",
+      "cache"
+    ],
+    "ground_truth_ocr": "Prompt Cache Hit 87%",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_03",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Session+42+Ready",
+    "category": "ocr",
+    "expected_keywords": [
+      "session",
+      "42"
+    ],
+    "ground_truth_ocr": "Session 42 Ready",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_04",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Latency+118+ms",
+    "category": "ocr",
+    "expected_keywords": [
+      "latency",
+      "118"
+    ],
+    "ground_truth_ocr": "Latency 118 ms",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "ocr_text_05",
+    "url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Voice+Mode+Enabled",
+    "category": "ocr",
+    "expected_keywords": [
+      "voice",
+      "mode"
+    ],
+    "ground_truth_ocr": "Voice Mode Enabled",
+    "expected_structure": {
+      "min_length": 10,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "document_text_01",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Invoice+1001+Total+42+Due+2026-04-22",
+    "category": "document",
+    "expected_keywords": [
+      "invoice",
+      "1001",
+      "total"
+    ],
+    "ground_truth_ocr": "Invoice 1001 Total 42 Due 2026-04-22",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "document_text_02",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Form+A+Name+Alice+Status+Approved",
+    "category": "document",
+    "expected_keywords": [
+      "form",
+      "alice",
+      "name"
+    ],
+    "ground_truth_ocr": "Form A Name Alice Status Approved",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
+  },
+  {
+    "id": "document_text_03",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Report+Memory+Recall+Score+91+Percent",
+    "category": "document",
+    "expected_keywords": [
+      "report",
+      "memory",
+      "recall"
+    ],
+    "ground_truth_ocr": "Report Memory Recall Score 91 Percent",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "document_text_04",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Checklist+Crisis+Escalation+Call+988+Now",
+    "category": "document",
+    "expected_keywords": [
+      "checklist",
+      "crisis",
+      "escalation"
+    ],
+    "ground_truth_ocr": "Checklist Crisis Escalation Call 988 Now",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
+  },
+  {
+    "id": "document_text_05",
+    "url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Meeting+Notes+Vision+Benchmark+Run+Pending",
+    "category": "document",
+    "expected_keywords": [
+      "meeting",
+      "notes",
+      "vision"
+    ],
+    "ground_truth_ocr": "Meeting Notes Vision Benchmark Run Pending",
+    "expected_structure": {
+      "min_length": 20,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  }
-]
+]
--- a/benchmarks/vision_benchmark.py
+++ b/benchmarks/vision_benchmark.py
@@ -22,10 +22,12 @@ import argparse
 import asyncio
 import base64
 import json
+import mimetypes
 import os
 import statistics
 import sys
 import time
+import urllib.request
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, List, Optional
@@ -41,12 +43,16 @@ MODELS = {
        "model_id": "google/gemma-4-27b-it",
        "display_name": "Gemma 4 27B",
        "provider": "nous",
+        "fallback_provider": "ollama",
+        "fallback_model_id": "gemma4:latest",
        "description": "Google's multimodal Gemma 4 model",
    },
    "gemini3_flash": {
        "model_id": "google/gemini-3-flash-preview",
        "display_name": "Gemini 3 Flash Preview",
        "provider": "openrouter",
+        "fallback_provider": "gemini",
+        "fallback_model_id": "gemini-2.5-flash",
        "description": "Current default vision model",
    },
 }
@@ -84,91 +90,150 @@ async def analyze_with_model(
    """
    import httpx

+    def _load_image_bytes_cached() -> tuple[bytes, str]:
+        nonlocal _image_bytes, _mime_type
+        if _image_bytes is not None:
+            return _image_bytes, _mime_type
+        if image_url.startswith(("http://", "https://")):
+            with urllib.request.urlopen(image_url, timeout=30) as resp:
+                _image_bytes = resp.read()
+                _mime_type = resp.headers.get_content_type() or mimetypes.guess_type(image_url)[0] or "image/png"
+        else:
+            path = Path(image_url).expanduser()
+            _image_bytes = path.read_bytes()
+            _mime_type = mimetypes.guess_type(str(path))[0] or "image/png"
+        return _image_bytes, _mime_type
+
+    def _data_url() -> str:
+        image_bytes, mime_type = _load_image_bytes_cached()
+        return f"data:{mime_type};base64,{base64.b64encode(image_bytes).decode()}"
+
+    def _provider_key(provider: str) -> str:
+        if provider == "openrouter":
+            return os.getenv("OPENROUTER_API_KEY", "")
+        if provider == "nous":
+            return os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
+        if provider == "gemini":
+            return os.getenv("GEMINI_API_KEY", "") or os.getenv("GOOGLE_API_KEY", "")
+        return os.getenv(f"{provider.upper()}_API_KEY", "")
+
    provider = model_config["provider"]
    model_id = model_config["model_id"]
+    candidates = [(provider, model_id)]
+    if model_config.get("fallback_provider") and model_config.get("fallback_model_id"):
+        candidates.append((model_config["fallback_provider"], model_config["fallback_model_id"]))

-    # Prepare messages
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
-            ],
-        }
-    ]
+    _image_bytes: Optional[bytes] = None
+    _mime_type = "image/png"
+    failures = []

-    # Route to provider
-    if provider == "openrouter":
-        api_url = "https://openrouter.ai/api/v1/chat/completions"
-        api_key = os.getenv("OPENROUTER_API_KEY", "")
-    elif provider == "nous":
-        api_url = "https://inference.nousresearch.com/v1/chat/completions"
-        api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
-    else:
-        api_url = os.getenv(f"{provider.upper()}_API_URL", "")
-        api_key = os.getenv(f"{provider.upper()}_API_KEY", "")
+    for candidate_provider, candidate_model in candidates:
+        api_key = _provider_key(candidate_provider)
+        start = time.perf_counter()
+        try:
+            if candidate_provider in {"openrouter", "nous"}:
+                api_url = (
+                    "https://openrouter.ai/api/v1/chat/completions"
+                    if candidate_provider == "openrouter"
+                    else "https://inference.nousresearch.com/v1/chat/completions"
+                )
+                if not api_key:
+                    raise RuntimeError(f"No API key for provider {candidate_provider}")
+                payload = {
+                    "model": candidate_model,
+                    "messages": [{
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {"type": "image_url", "image_url": {"url": _data_url() if not image_url.startswith(("http://", "https://")) else image_url}},
+                        ],
+                    }],
+                    "max_tokens": 2000,
+                    "temperature": 0.1,
+                }
+                headers = {
+                    "Authorization": f"Bearer {api_key}",
+                    "Content-Type": "application/json",
+                }
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    resp = await client.post(api_url, json=payload, headers=headers)
+                    resp.raise_for_status()
+                    data = resp.json()
+                analysis = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+                usage = data.get("usage", {})
+                tokens = {
+                    "prompt_tokens": usage.get("prompt_tokens", 0),
+                    "completion_tokens": usage.get("completion_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                }
+            elif candidate_provider == "gemini":
+                if not api_key:
+                    raise RuntimeError("No API key for provider gemini")
+                image_bytes, mime_type = _load_image_bytes_cached()
+                api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{candidate_model}:generateContent?key={api_key}"
+                payload = {
+                    "contents": [{"parts": [
+                        {"text": prompt},
+                        {"inline_data": {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}},
+                    ]}],
+                    "generationConfig": {"temperature": 0.1, "maxOutputTokens": 2000},
+                }
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    resp = await client.post(api_url, json=payload)
+                    resp.raise_for_status()
+                    data = resp.json()
+                parts = data.get("candidates", [{}])[0].get("content", {}).get("parts", [])
+                analysis = "\n".join(part.get("text", "") for part in parts if isinstance(part, dict) and part.get("text"))
+                usage = data.get("usageMetadata", {})
+                tokens = {
+                    "prompt_tokens": usage.get("promptTokenCount", 0),
+                    "completion_tokens": usage.get("candidatesTokenCount", 0),
+                    "total_tokens": usage.get("totalTokenCount", 0),
+                }
+            elif candidate_provider == "ollama":
+                image_bytes, _ = _load_image_bytes_cached()
+                payload = {
+                    "model": candidate_model,
+                    "stream": False,
+                    "messages": [{"role": "user", "content": prompt, "images": [base64.b64encode(image_bytes).decode()]}],
+                    "options": {"temperature": 0.1},
+                }
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    resp = await client.post("http://localhost:11434/api/chat", json=payload)
+                    resp.raise_for_status()
+                    data = resp.json()
+                analysis = data.get("message", {}).get("content", "")
+                tokens = {
+                    "prompt_tokens": data.get("prompt_eval_count", 0),
+                    "completion_tokens": data.get("eval_count", 0),
+                    "total_tokens": (data.get("prompt_eval_count", 0) or 0) + (data.get("eval_count", 0) or 0),
+                }
+            else:
+                raise RuntimeError(f"Unsupported provider {candidate_provider}")

-    if not api_key:
-        return {
-            "analysis": "",
-            "latency_ms": 0,
-            "tokens": {},
-            "success": False,
-            "error": f"No API key for provider {provider}",
-        }
+            latency_ms = (time.perf_counter() - start) * 1000
+            return {
+                "analysis": analysis,
+                "latency_ms": round(latency_ms, 1),
+                "tokens": tokens,
+                "success": True,
+                "error": "",
+                "provider_used": candidate_provider,
+                "model_used": candidate_model,
+            }
+        except Exception as e:
+            failures.append(f"{candidate_provider}:{candidate_model} => {e}")

-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
+    return {
+        "analysis": "",
+        "latency_ms": 0,
+        "tokens": {},
+        "success": False,
+        "error": " | ".join(failures) if failures else "No runs",
+        "provider_used": candidates[-1][0] if candidates else provider,
+        "model_used": candidates[-1][1] if candidates else model_id,
    }

-    payload = {
-        "model": model_id,
-        "messages": messages,
-        "max_tokens": 2000,
-        "temperature": 0.1,
-    }
-
-    start = time.perf_counter()
-    try:
-        async with httpx.AsyncClient(timeout=timeout) as client:
-            resp = await client.post(api_url, json=payload, headers=headers)
-            resp.raise_for_status()
-            data = resp.json()
-
-        latency_ms = (time.perf_counter() - start) * 1000
-
-        analysis = ""
-        choices = data.get("choices", [])
-        if choices:
-            msg = choices[0].get("message", {})
-            analysis = msg.get("content", "")
-
-        usage = data.get("usage", {})
-        tokens = {
-            "prompt_tokens": usage.get("prompt_tokens", 0),
-            "completion_tokens": usage.get("completion_tokens", 0),
-            "total_tokens": usage.get("total_tokens", 0),
-        }
-
-        return {
-            "analysis": analysis,
-            "latency_ms": round(latency_ms, 1),
-            "tokens": tokens,
-            "success": True,
-            "error": "",
-        }
-
-    except Exception as e:
-        return {
-            "analysis": "",
-            "latency_ms": round((time.perf_counter() - start) * 1000, 1),
-            "tokens": {},
-            "success": False,
-            "error": str(e),
-        }
-

 # ---------------------------------------------------------------------------
 # Evaluation metrics
@@ -398,7 +463,13 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
        failed = [r[model_name] for r in results if not r[model_name]["success"]]

        if not model_results:
-            summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
+            summary[model_name] = {
+                "success_rate": 0,
+                "error": "All runs failed",
+                "total_runs": 0,
+                "total_failures": len(failed),
+                "failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
+            }
            continue

        latencies = [r["avg_latency_ms"] for r in model_results]
@@ -410,6 +481,7 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
            "success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
            "total_runs": len(model_results),
            "total_failures": len(failed),
+            "failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
            "latency": {
                "mean_ms": round(statistics.mean(latencies), 1),
                "median_ms": round(statistics.median(latencies), 1),
@@ -495,6 +567,23 @@ def to_markdown(report: dict) -> str:
                f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
            )

+    lines += ["", "## Failure Modes", ""]
+    had_failures = False
+    for mkey, mname in config["models"].items():
+        model_summary = summary.get(mkey, {})
+        failure_examples = model_summary.get("failure_examples", [])
+        if not failure_examples and not model_summary.get("error"):
+            continue
+        had_failures = True
+        lines.append(f"### {mname}")
+        if model_summary.get("error"):
+            lines.append(f"- Summary: {model_summary['error']}")
+        for err in failure_examples:
+            lines.append(f"- {err}")
+        lines.append("")
+    if not had_failures:
+        lines.append("- No provider/runtime failures recorded.")
+
    # Verdict
    lines += ["", "## Verdict", ""]

@@ -516,8 +605,12 @@ def to_markdown(report: dict) -> str:

    if best_model:
        lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
+        lines.append("")
+        lines.append("Recommendation: keep the best-performing Gemma/Gemini lane from this run and only switch if repeated runs disagree.")
    else:
-        lines.append("No clear winner — insufficient data.")
+        lines.append("Benchmark blocked or insufficient data for a trustworthy winner.")
+        lines.append("")
+        lines.append("Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.")

    return "\n".join(lines)

@@ -528,44 +621,124 @@ def to_markdown(report: dict) -> str:


 def generate_sample_dataset() -> List[dict]:
-    """Generate a sample test dataset with diverse public images.
+    """Generate a larger benchmark dataset aligned with issue #817.

-    Returns list of test image definitions.
+    Returns 50+ images across screenshots, diagrams, photos, OCR, charts,
+    and document-like images so the harness matches the issue contract.
    """
-    return [
-        # Screenshots
-        {
-            "id": "screenshot_github",
-            "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
+    dataset: List[dict] = []
+
+    screenshots = [
+        ("github_mark", "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", ["github", "logo", "mark"]),
+        ("github_social", "https://github.githubassets.com/images/modules/site/social-cards.png", ["github", "page", "web"]),
+        ("github_code_search", "https://github.githubassets.com/images/modules/site/features-code-search.png", ["search", "code", "feature"]),
+        ("terminal_capture", "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", ["terminal", "command", "output"]),
+        ("http_404", "https://http.cat/404.jpg", ["404", "error", "cat"]),
+        ("dummy_cli_01", "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01", ["hermes", "cli", "session"]),
+        ("dummy_cli_02", "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard", ["prompt", "cache", "dashboard"]),
+        ("dummy_ui_01", "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode", ["settings", "voice", "mode"]),
+        ("dummy_ui_02", "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview", ["browser", "vision", "preview"]),
+        ("dummy_ui_03", "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector", ["tool", "call", "inspector"]),
+    ]
+    for ident, url, keywords in screenshots:
+        dataset.append({
+            "id": f"screenshot_{ident}",
+            "url": url,
            "category": "screenshot",
-            "expected_keywords": ["github", "logo", "octocat"],
-            "expected_structure": {"min_length": 50, "min_sentences": 2},
-        },
-        # Diagrams
-        {
-            "id": "diagram_architecture",
-            "url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
+            "expected_keywords": keywords,
+            "ground_truth_ocr": "",
+            "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
+        })
+
+    diagrams = [
+        ("flow_a", "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker", ["flowchart", "api", "worker"]),
+        ("flow_b", "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client", ["architecture", "diagram", "cache"]),
+        ("uml_a", "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message", ["class", "diagram", "session"]),
+        ("uml_b", "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response", ["sequence", "diagram", "response"]),
+        ("network_a", "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router", ["network", "node", "router"]),
+        ("network_b", "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth", ["service", "mesh", "auth"]),
+        ("state_machine", "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop", ["state", "machine", "idle"]),
+        ("mind_map", "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools", ["mind", "memory", "tools"]),
+        ("pipeline", "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize", ["pipeline", "ingest", "summarize"]),
+        ("org_chart", "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops", ["org", "chart", "review"]),
+    ]
+    for ident, url, keywords in diagrams:
+        dataset.append({
+            "id": f"diagram_{ident}",
+            "url": url,
            "category": "diagram",
-            "expected_keywords": ["architecture", "component", "service"],
-            "expected_structure": {"min_length": 100, "min_sentences": 3},
-        },
-        # Photos
-        {
-            "id": "photo_nature",
-            "url": "https://picsum.photos/seed/bench1/400/300",
+            "expected_keywords": keywords,
+            "ground_truth_ocr": "",
+            "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": False},
+        })
+
+    for idx in range(1, 11):
+        dataset.append({
+            "id": f"photo_random_{idx:02d}",
+            "url": f"https://picsum.photos/seed/vision-bench-{idx}/640/480",
            "category": "photo",
            "expected_keywords": [],
-            "expected_structure": {"min_length": 30, "min_sentences": 1},
-        },
-        # Charts
-        {
-            "id": "chart_bar",
-            "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
-            "category": "chart",
-            "expected_keywords": ["bar", "chart", "data"],
-            "expected_structure": {"min_length": 50, "min_sentences": 2},
-        },
+            "ground_truth_ocr": "",
+            "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
+        })
+
+    charts = [
+        ("bar_quarterly", "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", ["bar", "chart", "revenue"]),
+        ("pie_market", "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", ["pie", "chart", "percentage"]),
+        ("line_temp", "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", ["line", "chart", "temperature"]),
+        ("radar_skill", "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", ["radar", "chart", "skill"]),
+        ("stacked_cloud", "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", ["stacked", "bar", "chart"]),
+        ("area_growth", "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}", ["line", "growth", "chart"]),
+        ("scatter_eval", "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}", ["scatter", "chart", "runs"]),
+        ("horizontal_bar", "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}", ["bar", "score", "ocr"]),
+        ("bubble_usage", "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}", ["bubble", "latency", "chart"]),
+        ("doughnut_devices", "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", ["doughnut", "chart", "device"]),
    ]
+    for ident, url, keywords in charts:
+        dataset.append({
+            "id": f"chart_{ident}",
+            "url": url,
+            "category": "chart",
+            "expected_keywords": keywords,
+            "ground_truth_ocr": "",
+            "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": True},
+        })
+
+    ocr_texts = [
+        "Hermes OCR Alpha 01",
+        "Prompt Cache Hit 87%",
+        "Session 42 Ready",
+        "Latency 118 ms",
+        "Voice Mode Enabled",
+    ]
+    for idx, text in enumerate(ocr_texts, start=1):
+        dataset.append({
+            "id": f"ocr_text_{idx:02d}",
+            "url": f"https://dummyimage.com/1200x320/ffffff/000000.png&text={text.replace(' ', '+')}",
+            "category": "ocr",
+            "expected_keywords": text.lower().split()[:2],
+            "ground_truth_ocr": text,
+            "expected_structure": {"min_length": 10, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
+        })
+
+    documents = [
+        "Invoice 1001 Total 42 Due 2026-04-22",
+        "Form A Name Alice Status Approved",
+        "Report Memory Recall Score 91 Percent",
+        "Checklist Crisis Escalation Call 988 Now",
+        "Meeting Notes Vision Benchmark Run Pending",
+    ]
+    for idx, text in enumerate(documents, start=1):
+        dataset.append({
+            "id": f"document_text_{idx:02d}",
+            "url": f"https://dummyimage.com/1400x900/f8fafc/0f172a.png&text={text.replace(' ', '+')}",
+            "category": "document",
+            "expected_keywords": text.lower().split()[:3],
+            "ground_truth_ocr": text,
+            "expected_structure": {"min_length": 20, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
+        })
+
+    return dataset


 def load_dataset(path: str) -> List[dict]:
@@ -585,7 +758,9 @@ async def main():
    parser.add_argument("--url", help="Single image URL to test")
    parser.add_argument("--category", default="photo", help="Category for single URL")
    parser.add_argument("--output", default=None, help="Output JSON file")
+    parser.add_argument("--markdown-output", default=None, help="Optional markdown report output path")
    parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
+    parser.add_argument("--limit", type=int, default=0, help="Limit to the first N images for smoke runs")
    parser.add_argument("--models", nargs="+", default=None,
                        help="Models to test (default: all)")
    parser.add_argument("--markdown", action="store_true", help="Output markdown report")
@@ -617,9 +792,14 @@ async def main():
        print("ERROR: Provide --images or --url")
        sys.exit(1)

+    if args.limit and args.limit > 0:
+        images = images[:args.limit]
+
    # Run benchmark
    report = await run_benchmark_suite(images, selected, args.runs)

+    markdown_report = to_markdown(report)
+
    # Output
    if args.output:
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
@@ -627,8 +807,14 @@ async def main():
            json.dump(report, f, indent=2)
        print(f"\nResults saved to {args.output}")

+    if args.markdown_output:
+        os.makedirs(os.path.dirname(args.markdown_output) or ".", exist_ok=True)
+        with open(args.markdown_output, "w", encoding="utf-8") as f:
+            f.write(markdown_report)
+        print(f"Markdown report saved to {args.markdown_output}")
+
    if args.markdown or not args.output:
-        print("\n" + to_markdown(report))
+        print("\n" + markdown_report)


 if __name__ == "__main__":
--- a/metrics/vision-benchmark-smoke-2026-04-22.json
+++ b/metrics/vision-benchmark-smoke-2026-04-22.json
@@ -0,0 +1,67 @@
+{
+  "generated_at": "2026-04-22T16:21:56.271426+00:00",
+  "config": {
+    "total_images": 2,
+    "runs_per_model": 1,
+    "models": {
+      "gemma4": "Gemma 4 27B",
+      "gemini3_flash": "Gemini 3 Flash Preview"
+    }
+  },
+  "results": [
+    {
+      "gemma4": {
+        "success": false,
+        "error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500",
+        "runs": 0,
+        "errors": 1
+      },
+      "gemini3_flash": {
+        "success": false,
+        "error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
+        "runs": 0,
+        "errors": 1
+      },
+      "image_id": "screenshot_github_mark",
+      "category": "screenshot"
+    },
+    {
+      "gemma4": {
+        "success": false,
+        "error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
+        "runs": 0,
+        "errors": 1
+      },
+      "gemini3_flash": {
+        "success": false,
+        "error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found",
+        "runs": 0,
+        "errors": 1
+      },
+      "image_id": "screenshot_github_social",
+      "category": "screenshot"
+    }
+  ],
+  "summary": {
+    "gemma4": {
+      "success_rate": 0,
+      "error": "All runs failed",
+      "total_runs": 0,
+      "total_failures": 2,
+      "failure_examples": [
+        "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
+        "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500"
+      ]
+    },
+    "gemini3_flash": {
+      "success_rate": 0,
+      "error": "All runs failed",
+      "total_runs": 0,
+      "total_failures": 2,
+      "failure_examples": [
+        "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
+        "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found"
+      ]
+    }
+  }
+}
--- a/metrics/vision-benchmark-smoke-2026-04-22.md
+++ b/metrics/vision-benchmark-smoke-2026-04-22.md
@@ -0,0 +1,44 @@
+# Vision Benchmark Report
+
+Generated: 2026-04-22T16:21
+Images tested: 2
+Runs per model: 1
+Models: Gemma 4 27B, Gemini 3 Flash Preview
+
+## Latency Comparison
+
+| Model | Mean (ms) | Median | P95 | Std Dev |
+|-------|-----------|--------|-----|---------|
+
+## Accuracy Comparison
+
+| Model | OCR Accuracy | Keyword Coverage | Success Rate |
+|-------|-------------|-----------------|--------------|
+
+## Token Usage
+
+| Model | Mean Tokens/Image | Total Tokens |
+|-------|------------------|--------------|
+
+## Failure Modes
+
+### Gemma 4 27B
+- Summary: All runs failed
+- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found
+- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'
+For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500
+
+### Gemini 3 Flash Preview
+- Summary: All runs failed
+- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
+For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'
+For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
+- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
+For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found
+
+
+## Verdict
+
+Benchmark blocked or insufficient data for a trustworthy winner.
+
+Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.
--- a/research_human_confirmation_firewall.md
+++ b/research_human_confirmation_firewall.md
@@ -1,515 +0,0 @@
-# Human Confirmation Firewall: Research Report
-## Implementation Patterns for Hermes Agent
-
-**Issue:** #878  
-**Parent:** #659  
-**Priority:** P0  
-**Scope:** Human-in-the-loop safety patterns for tool calls, crisis handling, and irreversible actions
-
---
-
-## Executive Summary
-
-Hermes already has a partial human confirmation firewall, but it is narrow.
-
-Current repo state shows:
- a real **pre-execution gate** for dangerous terminal commands in `tools/approval.py`
- a partial **confidence-threshold path** via `_smart_approve()` in `tools/approval.py`
- gateway support for blocking approval resolution in `gateway/run.py`
-
-What is still missing is the core recommendation from this research issue:
- **confidence scoring on all tool calls**, not just terminal commands that already matched a dangerous regex
- a **hard pre-execution human gate for crisis interventions**, especially any action that would auto-respond to suicidal content
- a consistent way to classify actions into:
-  1. pre-execution gate
-  2. post-execution review
-  3. confidence-threshold execution
-
-Recommendation:
- use **Pattern 1: Pre-Execution Gate** for crisis interventions and irreversible/high-impact actions
- use **Pattern 3: Confidence Threshold** for normal operations
- reserve **Pattern 2: Post-Execution Review** only for low-risk and reversible actions
-
-The next implementation step should be a **tool-call risk assessment layer** that runs before dispatch in `model_tools.handle_function_call()`, assigns a score and pattern to every tool call, and routes only the highest-risk calls into mandatory human confirmation.
-
---
-
-## 1. The Three Proven Patterns
-
-### Pattern 1: Pre-Execution Gate
-
-Definition:
- halt before execution
- show the proposed action to the human
- require explicit approval or denial
-
-Best for:
- destructive actions
- irreversible side effects
- crisis interventions
- actions that affect another human's safety, money, infrastructure, or private data
-
-Strengths:
- strongest safety guarantee
- simplest audit story
- prevents the most catastrophic failure mode: acting first and apologizing later
-
-Weaknesses:
- adds latency
- creates operator burden if overused
- should not be applied to every ordinary tool call
-
-### Pattern 2: Post-Execution Review
-
-Definition:
- execute first
- expose result to human
- allow rollback or follow-up correction
-
-Best for:
- reversible operations
- low-risk actions with fast recovery
- tasks where human review matters but immediate execution is acceptable
-
-Strengths:
- low friction
- fast iteration
- useful when rollback is practical
-
-Weaknesses:
- unsafe for crisis or destructive actions
- only works when rollback actually exists
- a poor fit for external communication or life-safety contexts
-
-### Pattern 3: Confidence Threshold
-
-Definition:
- compute a risk/confidence score before execution
- auto-execute high-confidence safe actions
- request confirmation for lower-confidence or higher-risk actions
-
-Best for:
- mixed-risk tool ecosystems
- day-to-day operations where always-confirm would be too expensive
- systems with a large volume of ordinary, safe reads and edits
-
-Strengths:
- best balance of speed and safety
- scales across many tool types
- allows targeted human attention where it matters most
-
-Weaknesses:
- depends on a good scoring model
- weak scoring creates false negatives or unnecessary prompts
- must remain inspectable and debuggable
-
---
-
-## 2. What Hermes Already Has
-
-## 2.1 Existing Pre-Execution Gate for Dangerous Terminal Commands
-
-`tools/approval.py` already implements a real pre-execution confirmation path for dangerous shell commands.
-
-Observed components:
- `DANGEROUS_PATTERNS`
- `detect_dangerous_command()`
- `prompt_dangerous_approval()`
- `check_dangerous_command()`
- gateway queueing and resolution support in the same module
-
-This is already Pattern 1.
-
-Current behavior:
- dangerous terminal commands are detected before execution
- the user can allow once / session / always / deny
- gateway sessions can block until approval resolves
-
-This is a strong foundation, but it is limited to a subset of terminal commands.
-
-## 2.2 Partial Confidence Threshold via Smart Approvals
-
-Hermes also already has a partial Pattern 3.
-
-Observed component:
- `_smart_approve()` in `tools/approval.py`
-
-Current behavior:
- only runs **after** a command has already been flagged by dangerous-pattern detection
- uses the auxiliary LLM to decide:
-  - approve
-  - deny
-  - escalate
-
-This means Hermes has a confidence-threshold mechanism, but only for **already-flagged dangerous terminal commands**.
-
-What it does not yet do:
- score all tool calls
- classify non-terminal tools
- distinguish crisis interventions from normal ops
- produce a shared risk model across the tool surface
-
-## 2.3 Blocking Approval UX in Gateway
-
-`gateway/run.py` already routes `/approve` and `/deny` into the blocking approval path.
-
-This means the infrastructure for a true human confirmation firewall already exists in messaging contexts.
-
-That is important because the missing work is not "invent human approval from zero."
-The missing work is:
- expand the scope from dangerous shell commands to **all tool calls that matter**
- make the routing policy explicit and inspectable
-
---
-
-## 3. What Hermes Still Lacks
-
-## 3.1 No Universal Tool-Call Risk Assessment
-
-The current approval system is command-pattern-centric.
-It is not yet a tool-call firewall.
-
-Missing capability:
- before dispatch, every tool call should receive a structured assessment:
-  - tool name
-  - side-effect class
-  - reversibility
-  - human-impact potential
-  - crisis relevance
-  - confidence score
-  - recommended confirmation pattern
-
-Natural insertion point:
- `model_tools.handle_function_call()`
-
-That function already sits at the central dispatch boundary.
-It is the right place to add a pre-dispatch classifier.
-
-## 3.2 No Hard Crisis Gate for Outbound Intervention
-
-Issue #878 explicitly recommends:
- Pattern 1 for crisis interventions
- never auto-respond to suicidal content
-
-That recommendation is not yet codified as a global firewall rule.
-
-Missing rule:
- if a tool call would directly intervene in a crisis context or send outward guidance in response to suicidal content, it must require explicit human confirmation before execution
-
-Examples that should hard-gate:
- outbound `send_message` content aimed at a suicidal user
- any future tool that places calls, escalates emergencies, or contacts third parties about a crisis
- any autonomous action that claims a person should or should not take a life-safety step
-
-## 3.3 No First-Class Post-Execution Review Policy
-
-Hermes has approval and denial, but it does not yet have a formal policy for when Pattern 2 is acceptable.
-
-Without a policy, post-execution review tends to get used implicitly rather than intentionally.
-
-That is risky.
-
-Hermes should define Pattern 2 narrowly:
- only for actions that are both low-risk and reversible
- only when the system can show the human exactly what happened
- never for crisis, finance, destructive config, or sensitive comms
-
---
-
-## 4. Recommended Architecture for Hermes
-
-## 4.1 Add a Tool-Call Assessment Layer
-
-Add a pre-dispatch assessment object for every tool call.
-
-Suggested shape:
-
-```python
-@dataclass
-class ToolCallAssessment:
-    tool_name: str
-    risk_score: float          # 0.0 to 1.0
-    confidence: float          # confidence in the assessment itself
-    pattern: str               # pre_execution_gate | post_execution_review | confidence_threshold
-    requires_human: bool
-    reasons: list[str]
-    reversible: bool
-    crisis_sensitive: bool
-```
-
-Suggested execution point:
- inside `model_tools.handle_function_call()` before `orchestrator.dispatch()`
-
-Why here:
- one place covers all tools
- one place can emit traces
- one place can remain model-agnostic
- one place lets plugins observe or override the assessment
-
-## 4.2 Classify Tool Calls by Side-Effect Class
-
-Suggested first-pass taxonomy:
-
-### A. Read-only
-Examples:
- `read_file`
- `search_files`
- `browser_snapshot`
- `browser_console` read-only inspection
-
-Pattern:
- confidence threshold
- almost always auto-execute
- human confirmation normally unnecessary
-
-### B. Local reversible edits
-Examples:
- `patch`
- `write_file`
- `todo`
-
-Pattern:
- confidence threshold
- human confirmation only when risk score rises because of path sensitivity or scope breadth
-
-### C. External side effects
-Examples:
- `send_message`
- `cronjob`
- `delegate_task`
- smart-home actuation tools
-
-Pattern:
- confidence threshold by default
- pre-execution gate when score exceeds threshold or when context is sensitive
-
-### D. Critical / destructive / crisis-sensitive
-Examples:
- dangerous `terminal`
- financial actions
- deletion / kill / restart / deployment in sensitive paths
- outbound crisis intervention
-
-Pattern:
- pre-execution gate
- never auto-execute on confidence alone
-
-## 4.3 Crisis Override Rule
-
-Add a hard override:
-
-```text
-If tool call is crisis-sensitive AND outbound or irreversible:
-    requires_human = True
-    pattern = pre_execution_gate
-```
-
-This is the most important rule in the issue.
-
-The model may draft the message.
-The human must confirm before the system sends it.
-
-## 4.4 Use Confidence Threshold for Normal Ops
-
-For non-crisis operations, use Pattern 3.
-
-Suggested logic:
- low risk + high assessment confidence -> auto-execute
- medium risk or medium confidence -> ask human
- high risk -> always ask human
-
-Key point:
- confidence is not just "how sure the LLM is"
- confidence should combine:
-  - tool type certainty
-  - argument clarity
-  - path sensitivity
-  - external side effects
-  - crisis indicators
-
---
-
-## 5. Recommended Initial Scoring Factors
-
-A simple initial scorer is enough.
-It does not need to be fancy.
-
-Suggested factors:
-
-### 5.1 Tool class risk
- read-only tools: very low base risk
- local mutation tools: moderate base risk
- external communication / automation tools: higher base risk
- shell execution: variable, often high
-
-### 5.2 Target sensitivity
-Examples:
- `/tmp` or local scratch paths -> lower
- repo files under git -> medium
- system config, credentials, secrets, gateway lifecycle -> high
- human-facing channels -> high if message content is sensitive
-
-### 5.3 Reversibility
- reversible -> lower
- difficult but possible to undo -> medium
- practically irreversible -> high
-
-### 5.4 Human-impact content
- no direct human impact -> low
- administrative impact -> medium
- crisis / safety / emotional intervention -> critical
-
-### 5.5 Context certainty
- arguments are explicit and narrow -> higher confidence
- arguments are vague, inferred, or broad -> lower confidence
-
---
-
-## 6. Implementation Plan
-
-## Phase 1: Assessment Without Behavior Change
-
-Goal:
- score all tool calls
- log assessment decisions
- emit traces for review
- do not yet block new tool categories
-
-Files to touch:
- `tools/approval.py`
- `model_tools.py`
- tests for assessment coverage
-
-Output:
- risk/confidence trace for every tool call
- pattern recommendation for every tool call
-
-Why first:
- lets us calibrate before changing runtime behavior
- avoids breaking existing workflows blindly
-
-## Phase 2: Hard-Gate Crisis-Sensitive Outbound Actions
-
-Goal:
- enforce Pattern 1 for crisis interventions
-
-Likely surfaces:
- `send_message`
- any future telephony / call / escalation tools
- other tools with direct human intervention side effects
-
-Rule:
- never auto-send crisis intervention content without human confirmation
-
-## Phase 3: General Confidence Threshold for Normal Ops
-
-Goal:
- apply Pattern 3 to all tool calls
- auto-run clearly safe actions
- escalate ambiguous or medium-risk actions
-
-Likely thresholds:
- score < 0.25 -> auto
- 0.25 to 0.60 -> confirm if confidence is weak
- > 0.60 -> confirm
- crisis-sensitive -> always confirm
-
-## Phase 4: Optional Post-Execution Review Lane
-
-Goal:
- allow Pattern 2 only for explicitly reversible operations
-
-Examples:
- maybe low-risk messaging drafts saved locally
- maybe reversible UI actions in specific environments
-
-Important:
- this phase is optional
- Hermes should not rely on Pattern 2 for safety-critical flows
-
---
-
-## 7. Verification Criteria for the Future Implementation
-
-The eventual implementation should prove all of the following:
-
-1. every tool call receives a scored assessment before dispatch
-2. crisis-sensitive outbound actions always require human confirmation
-3. dangerous terminal commands still preserve their current pre-execution gate
-4. clearly safe read-only tool calls are not slowed by unnecessary prompts
-5. assessment traces can be inspected after a run
-6. approval decisions remain session-safe across CLI and gateway contexts
-
---
-
-## 8. Concrete Recommendations
-
-### Recommendation 1
-Do **not** replace the current dangerous-command approval path.
-Generalize above it.
-
-Why:
- existing terminal Pattern 1 already works
- this is the strongest piece of the current firewall
-
-### Recommendation 2
-Add a universal scorer in `model_tools.handle_function_call()`.
-
-Why:
- that is the first point where Hermes knows the tool name and structured arguments
- it is the cleanest place to classify all tool calls uniformly
-
-### Recommendation 3
-Treat crisis-sensitive outbound intervention as a separate safety class.
-
-Why:
- issue #878 explicitly calls for Pattern 1 here
- this matches Timmy's SOUL-level safety requirements
-
-### Recommendation 4
-Ship scoring traces before enforcement expansion.
-
-Why:
- you cannot tune thresholds you cannot inspect
- false positives will otherwise frustrate normal usage
-
-### Recommendation 5
-Use Pattern 3 as the default policy for normal operations.
-
-Why:
- full manual confirmation on every tool call is too expensive
- full autonomy is too risky
- Pattern 3 is the practical middle ground
-
---
-
-## 9. Bottom Line
-
-Hermes should implement a **two-track human confirmation firewall**:
-
-1. **Pattern 1: Pre-Execution Gate**
-   - crisis interventions
-   - destructive terminal actions
-   - irreversible or safety-critical tool calls
-
-2. **Pattern 3: Confidence Threshold**
-   - all ordinary tool calls
-   - driven by a universal tool-call assessment layer
-   - integrated at the central dispatch boundary
-
-Pattern 2 should remain optional and narrow.
-It is not the primary answer for Hermes.
-
-The repo already contains the beginnings of this system.
-The next step is not new theory.
-It is to turn the existing approval path into a true **tool-call-wide human confirmation firewall**.
-
---
-
-## References
-
- Issue #878 — Human Confirmation Firewall Implementation Patterns
- Issue #659 — Critical Research Tasks
- `tools/approval.py` — current dangerous-command approval flow and smart approvals
- `model_tools.py` — central tool dispatch boundary
- `gateway/run.py` — blocking approval handling for messaging sessions
--- a/tests/test_vision_benchmark.py
+++ b/tests/test_vision_benchmark.py
@@ -199,7 +199,7 @@ class TestMarkdown:
 class TestDataset:
    def test_sample_dataset_has_entries(self):
        dataset = generate_sample_dataset()
-        assert len(dataset) >= 4
+        assert len(dataset) >= 50

    def test_sample_dataset_structure(self):
        dataset = generate_sample_dataset()
@@ -216,6 +216,9 @@ class TestDataset:
        assert "screenshot" in categories
        assert "diagram" in categories
        assert "photo" in categories
+        assert "chart" in categories
+        assert "ocr" in categories
+        assert "document" in categories


 class TestModels:
--- a/tests/test_vision_benchmark_artifacts.py
+++ b/tests/test_vision_benchmark_artifacts.py
@@ -0,0 +1,21 @@
+import json
+from pathlib import Path
+
+
+DATASET = Path("benchmarks/test_images.json")
+REPORT = Path("metrics/vision-benchmark-smoke-2026-04-22.md")
+
+
+def test_benchmark_dataset_is_issue_sized_and_category_complete() -> None:
+    items = json.loads(DATASET.read_text(encoding="utf-8"))
+    assert len(items) >= 50
+    categories = {item["category"] for item in items}
+    assert {"screenshot", "diagram", "photo", "ocr", "chart", "document"}.issubset(categories)
+
+
+def test_metrics_report_exists_with_recommendation() -> None:
+    assert REPORT.exists(), "missing benchmark report under metrics/"
+    text = REPORT.read_text(encoding="utf-8")
+    assert "Recommendation" in text
+    assert "Gemma 4" in text
+    assert "Gemini" in text
Author	SHA1	Message	Date
Alexander Whitestone	9d05f77a9b	feat: harden vision benchmark artifacts All checks were successful Lint / lint (pull_request) Successful in 9s Details Refs #817	2026-04-22 12:22:28 -04:00
Alexander Whitestone	23e093fc75	wip: tighten vision benchmark acceptance tests	2026-04-22 12:10:23 -04:00
Alexander Whitestone	f77ce4dff2	wip: add regression tests for vision benchmark artifacts	2026-04-22 12:07:52 -04:00