Compare commits

..

3 Commits

Author SHA1 Message Date
Alexander Whitestone
9d05f77a9b feat: harden vision benchmark artifacts
All checks were successful
Lint / lint (pull_request) Successful in 9s
Refs #817
2026-04-22 12:22:28 -04:00
Alexander Whitestone
23e093fc75 wip: tighten vision benchmark acceptance tests 2026-04-22 12:10:23 -04:00
Alexander Whitestone
f77ce4dff2 wip: add regression tests for vision benchmark artifacts 2026-04-22 12:07:52 -04:00
9 changed files with 1137 additions and 1103 deletions

View File

@@ -1,194 +1,757 @@
[
{
"id": "screenshot_github_home",
"id": "screenshot_github_mark",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "mark"],
"expected_keywords": [
"github",
"logo",
"mark"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_mermaid_flow",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
"id": "screenshot_github_social",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": [
"github",
"page",
"web"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_github_code_search",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": [
"search",
"code",
"feature"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_terminal_capture",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": [
"terminal",
"command",
"output"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_http_404",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": [
"404",
"error",
"cat"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_01",
"url": "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01",
"category": "screenshot",
"expected_keywords": [
"hermes",
"cli",
"session"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_02",
"url": "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard",
"category": "screenshot",
"expected_keywords": [
"prompt",
"cache",
"dashboard"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_01",
"url": "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode",
"category": "screenshot",
"expected_keywords": [
"settings",
"voice",
"mode"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_02",
"url": "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview",
"category": "screenshot",
"expected_keywords": [
"browser",
"vision",
"preview"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_03",
"url": "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector",
"category": "screenshot",
"expected_keywords": [
"tool",
"call",
"inspector"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_flow_a",
"url": "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker",
"category": "diagram",
"expected_keywords": ["flow", "diagram", "process"],
"expected_keywords": [
"flowchart",
"api",
"worker"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_1",
"url": "https://picsum.photos/seed/vision1/400/300",
"category": "photo",
"expected_keywords": [],
"id": "diagram_flow_b",
"url": "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client",
"category": "diagram",
"expected_keywords": [
"architecture",
"diagram",
"cache"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_2",
"url": "https://picsum.photos/seed/vision2/400/300",
"category": "photo",
"expected_keywords": [],
"id": "diagram_uml_a",
"url": "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message",
"category": "diagram",
"expected_keywords": [
"class",
"diagram",
"session"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "chart_simple_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "revenue"],
"id": "diagram_uml_b",
"url": "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response",
"category": "diagram",
"expected_keywords": [
"sequence",
"diagram",
"response"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "chart_pie",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": ["pie", "chart", "percentage"],
"id": "diagram_network_a",
"url": "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router",
"category": "diagram",
"expected_keywords": [
"network",
"node",
"router"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_network_b",
"url": "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth",
"category": "diagram",
"expected_keywords": [
"service",
"mesh",
"auth"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_state_machine",
"url": "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop",
"category": "diagram",
"expected_keywords": [
"state",
"machine",
"idle"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_mind_map",
"url": "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools",
"category": "diagram",
"expected_keywords": [
"mind",
"memory",
"tools"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_pipeline",
"url": "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize",
"category": "diagram",
"expected_keywords": [
"pipeline",
"ingest",
"summarize"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_org_chart",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops",
"category": "diagram",
"expected_keywords": ["organization", "hierarchy", "chart"],
"expected_keywords": [
"org",
"chart",
"review"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "screenshot_terminal",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"id": "photo_random_01",
"url": "https://picsum.photos/seed/vision-bench-1/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_line",
"id": "photo_random_02",
"url": "https://picsum.photos/seed/vision-bench-2/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_03",
"url": "https://picsum.photos/seed/vision-bench-3/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_04",
"url": "https://picsum.photos/seed/vision-bench-4/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_05",
"url": "https://picsum.photos/seed/vision-bench-5/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_06",
"url": "https://picsum.photos/seed/vision-bench-6/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_07",
"url": "https://picsum.photos/seed/vision-bench-7/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_08",
"url": "https://picsum.photos/seed/vision-bench-8/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_09",
"url": "https://picsum.photos/seed/vision-bench-9/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_10",
"url": "https://picsum.photos/seed/vision-bench-10/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_bar_quarterly",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": [
"bar",
"chart",
"revenue"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_pie_market",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": [
"pie",
"chart",
"percentage"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_line_temp",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"category": "chart",
"expected_keywords": ["line", "chart", "temperature"],
"expected_keywords": [
"line",
"chart",
"temperature"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "diagram_sequence",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_radar",
"id": "chart_radar_skill",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"category": "chart",
"expected_keywords": ["radar", "chart", "skill"],
"expected_keywords": [
"radar",
"chart",
"skill"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_5",
"url": "https://picsum.photos/seed/vision5/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": ["doughnut", "chart", "device"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
},
{
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_stacked_bar",
"id": "chart_stacked_cloud",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"category": "chart",
"expected_keywords": ["stacked", "bar", "chart"],
"expected_keywords": [
"stacked",
"bar",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "screenshot_dashboard",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": ["search", "code", "feature"],
"id": "chart_area_growth",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}",
"category": "chart",
"expected_keywords": [
"line",
"growth",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_8",
"url": "https://picsum.photos/seed/vision8/400/300",
"category": "photo",
"expected_keywords": [],
"id": "chart_scatter_eval",
"url": "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}",
"category": "chart",
"expected_keywords": [
"scatter",
"chart",
"runs"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_horizontal_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}",
"category": "chart",
"expected_keywords": [
"bar",
"score",
"ocr"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_bubble_usage",
"url": "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}",
"category": "chart",
"expected_keywords": [
"bubble",
"latency",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_doughnut_devices",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": [
"doughnut",
"chart",
"device"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "ocr_text_01",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Hermes+OCR+Alpha+01",
"category": "ocr",
"expected_keywords": [
"hermes",
"ocr"
],
"ground_truth_ocr": "Hermes OCR Alpha 01",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_02",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Prompt+Cache+Hit+87%",
"category": "ocr",
"expected_keywords": [
"prompt",
"cache"
],
"ground_truth_ocr": "Prompt Cache Hit 87%",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_03",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Session+42+Ready",
"category": "ocr",
"expected_keywords": [
"session",
"42"
],
"ground_truth_ocr": "Session 42 Ready",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_04",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Latency+118+ms",
"category": "ocr",
"expected_keywords": [
"latency",
"118"
],
"ground_truth_ocr": "Latency 118 ms",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_05",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Voice+Mode+Enabled",
"category": "ocr",
"expected_keywords": [
"voice",
"mode"
],
"ground_truth_ocr": "Voice Mode Enabled",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "document_text_01",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Invoice+1001+Total+42+Due+2026-04-22",
"category": "document",
"expected_keywords": [
"invoice",
"1001",
"total"
],
"ground_truth_ocr": "Invoice 1001 Total 42 Due 2026-04-22",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_02",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Form+A+Name+Alice+Status+Approved",
"category": "document",
"expected_keywords": [
"form",
"a",
"name"
],
"ground_truth_ocr": "Form A Name Alice Status Approved",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "document_text_03",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Report+Memory+Recall+Score+91+Percent",
"category": "document",
"expected_keywords": [
"report",
"memory",
"recall"
],
"ground_truth_ocr": "Report Memory Recall Score 91 Percent",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_04",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Checklist+Crisis+Escalation+Call+988+Now",
"category": "document",
"expected_keywords": [
"checklist",
"crisis",
"escalation"
],
"ground_truth_ocr": "Checklist Crisis Escalation Call 988 Now",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_05",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Meeting+Notes+Vision+Benchmark+Run+Pending",
"category": "document",
"expected_keywords": [
"meeting",
"notes",
"vision"
],
"ground_truth_ocr": "Meeting Notes Vision Benchmark Run Pending",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
}
]
]

View File

@@ -22,10 +22,12 @@ import argparse
import asyncio
import base64
import json
import mimetypes
import os
import statistics
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
@@ -41,12 +43,16 @@ MODELS = {
"model_id": "google/gemma-4-27b-it",
"display_name": "Gemma 4 27B",
"provider": "nous",
"fallback_provider": "ollama",
"fallback_model_id": "gemma4:latest",
"description": "Google's multimodal Gemma 4 model",
},
"gemini3_flash": {
"model_id": "google/gemini-3-flash-preview",
"display_name": "Gemini 3 Flash Preview",
"provider": "openrouter",
"fallback_provider": "gemini",
"fallback_model_id": "gemini-2.5-flash",
"description": "Current default vision model",
},
}
@@ -84,91 +90,150 @@ async def analyze_with_model(
"""
import httpx
def _load_image_bytes_cached() -> tuple[bytes, str]:
nonlocal _image_bytes, _mime_type
if _image_bytes is not None:
return _image_bytes, _mime_type
if image_url.startswith(("http://", "https://")):
with urllib.request.urlopen(image_url, timeout=30) as resp:
_image_bytes = resp.read()
_mime_type = resp.headers.get_content_type() or mimetypes.guess_type(image_url)[0] or "image/png"
else:
path = Path(image_url).expanduser()
_image_bytes = path.read_bytes()
_mime_type = mimetypes.guess_type(str(path))[0] or "image/png"
return _image_bytes, _mime_type
def _data_url() -> str:
image_bytes, mime_type = _load_image_bytes_cached()
return f"data:{mime_type};base64,{base64.b64encode(image_bytes).decode()}"
def _provider_key(provider: str) -> str:
if provider == "openrouter":
return os.getenv("OPENROUTER_API_KEY", "")
if provider == "nous":
return os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
if provider == "gemini":
return os.getenv("GEMINI_API_KEY", "") or os.getenv("GOOGLE_API_KEY", "")
return os.getenv(f"{provider.upper()}_API_KEY", "")
provider = model_config["provider"]
model_id = model_config["model_id"]
candidates = [(provider, model_id)]
if model_config.get("fallback_provider") and model_config.get("fallback_model_id"):
candidates.append((model_config["fallback_provider"], model_config["fallback_model_id"]))
# Prepare messages
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
_image_bytes: Optional[bytes] = None
_mime_type = "image/png"
failures = []
# Route to provider
if provider == "openrouter":
api_url = "https://openrouter.ai/api/v1/chat/completions"
api_key = os.getenv("OPENROUTER_API_KEY", "")
elif provider == "nous":
api_url = "https://inference.nousresearch.com/v1/chat/completions"
api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
else:
api_url = os.getenv(f"{provider.upper()}_API_URL", "")
api_key = os.getenv(f"{provider.upper()}_API_KEY", "")
for candidate_provider, candidate_model in candidates:
api_key = _provider_key(candidate_provider)
start = time.perf_counter()
try:
if candidate_provider in {"openrouter", "nous"}:
api_url = (
"https://openrouter.ai/api/v1/chat/completions"
if candidate_provider == "openrouter"
else "https://inference.nousresearch.com/v1/chat/completions"
)
if not api_key:
raise RuntimeError(f"No API key for provider {candidate_provider}")
payload = {
"model": candidate_model,
"messages": [{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": _data_url() if not image_url.startswith(("http://", "https://")) else image_url}},
],
}],
"max_tokens": 2000,
"temperature": 0.1,
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
analysis = data.get("choices", [{}])[0].get("message", {}).get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
elif candidate_provider == "gemini":
if not api_key:
raise RuntimeError("No API key for provider gemini")
image_bytes, mime_type = _load_image_bytes_cached()
api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{candidate_model}:generateContent?key={api_key}"
payload = {
"contents": [{"parts": [
{"text": prompt},
{"inline_data": {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}},
]}],
"generationConfig": {"temperature": 0.1, "maxOutputTokens": 2000},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload)
resp.raise_for_status()
data = resp.json()
parts = data.get("candidates", [{}])[0].get("content", {}).get("parts", [])
analysis = "\n".join(part.get("text", "") for part in parts if isinstance(part, dict) and part.get("text"))
usage = data.get("usageMetadata", {})
tokens = {
"prompt_tokens": usage.get("promptTokenCount", 0),
"completion_tokens": usage.get("candidatesTokenCount", 0),
"total_tokens": usage.get("totalTokenCount", 0),
}
elif candidate_provider == "ollama":
image_bytes, _ = _load_image_bytes_cached()
payload = {
"model": candidate_model,
"stream": False,
"messages": [{"role": "user", "content": prompt, "images": [base64.b64encode(image_bytes).decode()]}],
"options": {"temperature": 0.1},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post("http://localhost:11434/api/chat", json=payload)
resp.raise_for_status()
data = resp.json()
analysis = data.get("message", {}).get("content", "")
tokens = {
"prompt_tokens": data.get("prompt_eval_count", 0),
"completion_tokens": data.get("eval_count", 0),
"total_tokens": (data.get("prompt_eval_count", 0) or 0) + (data.get("eval_count", 0) or 0),
}
else:
raise RuntimeError(f"Unsupported provider {candidate_provider}")
if not api_key:
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": f"No API key for provider {provider}",
}
latency_ms = (time.perf_counter() - start) * 1000
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
"provider_used": candidate_provider,
"model_used": candidate_model,
}
except Exception as e:
failures.append(f"{candidate_provider}:{candidate_model} => {e}")
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": " | ".join(failures) if failures else "No runs",
"provider_used": candidates[-1][0] if candidates else provider,
"model_used": candidates[-1][1] if candidates else model_id,
}
payload = {
"model": model_id,
"messages": messages,
"max_tokens": 2000,
"temperature": 0.1,
}
start = time.perf_counter()
try:
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
latency_ms = (time.perf_counter() - start) * 1000
analysis = ""
choices = data.get("choices", [])
if choices:
msg = choices[0].get("message", {})
analysis = msg.get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
}
except Exception as e:
return {
"analysis": "",
"latency_ms": round((time.perf_counter() - start) * 1000, 1),
"tokens": {},
"success": False,
"error": str(e),
}
# ---------------------------------------------------------------------------
# Evaluation metrics
@@ -398,7 +463,13 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
failed = [r[model_name] for r in results if not r[model_name]["success"]]
if not model_results:
summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
summary[model_name] = {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
}
continue
latencies = [r["avg_latency_ms"] for r in model_results]
@@ -410,6 +481,7 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
"success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
"total_runs": len(model_results),
"total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
"latency": {
"mean_ms": round(statistics.mean(latencies), 1),
"median_ms": round(statistics.median(latencies), 1),
@@ -495,6 +567,23 @@ def to_markdown(report: dict) -> str:
f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
)
lines += ["", "## Failure Modes", ""]
had_failures = False
for mkey, mname in config["models"].items():
model_summary = summary.get(mkey, {})
failure_examples = model_summary.get("failure_examples", [])
if not failure_examples and not model_summary.get("error"):
continue
had_failures = True
lines.append(f"### {mname}")
if model_summary.get("error"):
lines.append(f"- Summary: {model_summary['error']}")
for err in failure_examples:
lines.append(f"- {err}")
lines.append("")
if not had_failures:
lines.append("- No provider/runtime failures recorded.")
# Verdict
lines += ["", "## Verdict", ""]
@@ -516,8 +605,12 @@ def to_markdown(report: dict) -> str:
if best_model:
lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
lines.append("")
lines.append("Recommendation: keep the best-performing Gemma/Gemini lane from this run and only switch if repeated runs disagree.")
else:
lines.append("No clear winner — insufficient data.")
lines.append("Benchmark blocked or insufficient data for a trustworthy winner.")
lines.append("")
lines.append("Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.")
return "\n".join(lines)
@@ -528,44 +621,124 @@ def to_markdown(report: dict) -> str:
def generate_sample_dataset() -> List[dict]:
"""Generate a sample test dataset with diverse public images.
"""Generate a larger benchmark dataset aligned with issue #817.
Returns list of test image definitions.
Returns 50+ images across screenshots, diagrams, photos, OCR, charts,
and document-like images so the harness matches the issue contract.
"""
return [
# Screenshots
{
"id": "screenshot_github",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
dataset: List[dict] = []
screenshots = [
("github_mark", "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", ["github", "logo", "mark"]),
("github_social", "https://github.githubassets.com/images/modules/site/social-cards.png", ["github", "page", "web"]),
("github_code_search", "https://github.githubassets.com/images/modules/site/features-code-search.png", ["search", "code", "feature"]),
("terminal_capture", "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", ["terminal", "command", "output"]),
("http_404", "https://http.cat/404.jpg", ["404", "error", "cat"]),
("dummy_cli_01", "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01", ["hermes", "cli", "session"]),
("dummy_cli_02", "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard", ["prompt", "cache", "dashboard"]),
("dummy_ui_01", "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode", ["settings", "voice", "mode"]),
("dummy_ui_02", "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview", ["browser", "vision", "preview"]),
("dummy_ui_03", "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector", ["tool", "call", "inspector"]),
]
for ident, url, keywords in screenshots:
dataset.append({
"id": f"screenshot_{ident}",
"url": url,
"category": "screenshot",
"expected_keywords": ["github", "logo", "octocat"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
# Diagrams
{
"id": "diagram_architecture",
"url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
})
diagrams = [
("flow_a", "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker", ["flowchart", "api", "worker"]),
("flow_b", "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client", ["architecture", "diagram", "cache"]),
("uml_a", "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message", ["class", "diagram", "session"]),
("uml_b", "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response", ["sequence", "diagram", "response"]),
("network_a", "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router", ["network", "node", "router"]),
("network_b", "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth", ["service", "mesh", "auth"]),
("state_machine", "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop", ["state", "machine", "idle"]),
("mind_map", "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools", ["mind", "memory", "tools"]),
("pipeline", "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize", ["pipeline", "ingest", "summarize"]),
("org_chart", "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops", ["org", "chart", "review"]),
]
for ident, url, keywords in diagrams:
dataset.append({
"id": f"diagram_{ident}",
"url": url,
"category": "diagram",
"expected_keywords": ["architecture", "component", "service"],
"expected_structure": {"min_length": 100, "min_sentences": 3},
},
# Photos
{
"id": "photo_nature",
"url": "https://picsum.photos/seed/bench1/400/300",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": False},
})
for idx in range(1, 11):
dataset.append({
"id": f"photo_random_{idx:02d}",
"url": f"https://picsum.photos/seed/vision-bench-{idx}/640/480",
"category": "photo",
"expected_keywords": [],
"expected_structure": {"min_length": 30, "min_sentences": 1},
},
# Charts
{
"id": "chart_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "data"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
})
charts = [
("bar_quarterly", "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", ["bar", "chart", "revenue"]),
("pie_market", "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", ["pie", "chart", "percentage"]),
("line_temp", "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", ["line", "chart", "temperature"]),
("radar_skill", "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", ["radar", "chart", "skill"]),
("stacked_cloud", "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", ["stacked", "bar", "chart"]),
("area_growth", "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}", ["line", "growth", "chart"]),
("scatter_eval", "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}", ["scatter", "chart", "runs"]),
("horizontal_bar", "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}", ["bar", "score", "ocr"]),
("bubble_usage", "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}", ["bubble", "latency", "chart"]),
("doughnut_devices", "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", ["doughnut", "chart", "device"]),
]
for ident, url, keywords in charts:
dataset.append({
"id": f"chart_{ident}",
"url": url,
"category": "chart",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": True},
})
ocr_texts = [
"Hermes OCR Alpha 01",
"Prompt Cache Hit 87%",
"Session 42 Ready",
"Latency 118 ms",
"Voice Mode Enabled",
]
for idx, text in enumerate(ocr_texts, start=1):
dataset.append({
"id": f"ocr_text_{idx:02d}",
"url": f"https://dummyimage.com/1200x320/ffffff/000000.png&text={text.replace(' ', '+')}",
"category": "ocr",
"expected_keywords": text.lower().split()[:2],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 10, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
documents = [
"Invoice 1001 Total 42 Due 2026-04-22",
"Form A Name Alice Status Approved",
"Report Memory Recall Score 91 Percent",
"Checklist Crisis Escalation Call 988 Now",
"Meeting Notes Vision Benchmark Run Pending",
]
for idx, text in enumerate(documents, start=1):
dataset.append({
"id": f"document_text_{idx:02d}",
"url": f"https://dummyimage.com/1400x900/f8fafc/0f172a.png&text={text.replace(' ', '+')}",
"category": "document",
"expected_keywords": text.lower().split()[:3],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 20, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
return dataset
def load_dataset(path: str) -> List[dict]:
@@ -585,7 +758,9 @@ async def main():
parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--markdown-output", default=None, help="Optional markdown report output path")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
parser.add_argument("--limit", type=int, default=0, help="Limit to the first N images for smoke runs")
parser.add_argument("--models", nargs="+", default=None,
help="Models to test (default: all)")
parser.add_argument("--markdown", action="store_true", help="Output markdown report")
@@ -617,9 +792,14 @@ async def main():
print("ERROR: Provide --images or --url")
sys.exit(1)
if args.limit and args.limit > 0:
images = images[:args.limit]
# Run benchmark
report = await run_benchmark_suite(images, selected, args.runs)
markdown_report = to_markdown(report)
# Output
if args.output:
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
@@ -627,8 +807,14 @@ async def main():
json.dump(report, f, indent=2)
print(f"\nResults saved to {args.output}")
if args.markdown_output:
os.makedirs(os.path.dirname(args.markdown_output) or ".", exist_ok=True)
with open(args.markdown_output, "w", encoding="utf-8") as f:
f.write(markdown_report)
print(f"Markdown report saved to {args.markdown_output}")
if args.markdown or not args.output:
print("\n" + to_markdown(report))
print("\n" + markdown_report)
if __name__ == "__main__":

View File

@@ -1,387 +0,0 @@
# Morning Review Packet
Source epic: [EPIC: Morning review packet — Hermes harness features landed 2026-04-21](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/949)
## Epic context
EPIC: Morning review packet — Hermes harness features landed 2026-04-21
Source: git log on upstream/main since 2026-04-21 00:00 EDT, plus the current local branch `burn/921-poka-yoke-hardcoded-paths` for the branch-only path-guard work.
Important review note:
- Validate upstream-landed features on `upstream/main` or a synced branch.
- Validate the path-guard work on `burn/921-poka-yoke-hardcoded-paths`.
This epic is a morning-review packet: one QA issue per feature cluster, each with concrete acceptance criteria and targeted tests or manual checks.
## Success criteria
- [ ] Every issue has a clear PASS / FAIL outcome.
- [ ] Test output or manual evidence is attached to each issue.
- [ ] Any drift between upstream/main and forge/main is called out explicitly.
## Sub-issues
### Upstream/main features landed 2026-04-21
- [ ] #950 [QA] Verify AI Gateway provider UX + attribution headers
- [ ] #951 [QA] Verify transport abstraction + AnthropicTransport wiring
- [ ] #952 [QA] Verify CLI voice beep toggle
- [ ] #953 [QA] Verify bundled skill scripts run out of the box
- [ ] #954 [QA] Verify maps skill guest_house / camp_site / bakery expansion
- [ ] #955 [QA] Verify KittenTTS local provider end-to-end
- [ ] #956 [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
- [ ] #957 [QA] Verify optional adversarial-ux-test skill catalog flow
- [ ] #958 [QA] Verify /usage account limits in CLI + gateway
- [ ] #959 [QA] Verify OpenCode-Go curated catalog additions
- [ ] #960 [QA] Verify patch 'did you mean?' suggestions
- [ ] #961 [QA] Verify web dashboard update/restart action buttons
### Local branch-only work
- [ ] #962 [QA] Verify hardcoded-home path guard on burn/921 branch
## Summary
| Issue | State | Commits | Tests |
| --- | --- | --- | --- |
| #950 | open | 5 | 2 |
| #951 | open | 2 | 2 |
| #952 | open | 1 | 1 |
| #953 | open | 1 | 2 |
| #954 | open | 1 | 0 |
| #955 | open | 2 | 1 |
| #956 | open | 1 | 0 |
| #957 | open | 1 | 0 |
| #958 | open | 2 | 2 |
| #959 | open | 1 | 1 |
| #960 | open | 2 | 1 |
| #961 | closed | 1 | 0 |
| #962 | closed | 1 | 1 |
## #950 — [QA] Verify AI Gateway provider UX + attribution headers
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/950
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `b11753879` — attribution default_headers for ai-gateway provider
- `700437440` — curated picker with live pricing
- `ac26a460f` — promote ai-gateway in provider picker ordering
- `5bb2d11b0` — auto-promote free Moonshot models
- `29f57ec95` — Vercel deep-link for API key creation
### Targeted tests
- `tests/hermes_cli/test_ai_gateway_models.py`
- `tests/run_agent/test_provider_attribution_headers.py`
### Tasks
- [ ] Open `hermes model` and verify `ai-gateway` appears near the top.
- [ ] Verify live pricing appears in the picker.
- [ ] Verify free Moonshot models are promoted.
- [ ] Trigger API-key setup flow and verify the Vercel deep link.
- [ ] Send one ai-gateway request and verify attribution headers are attached.
### Acceptance criteria
- [ ] UI ordering and pricing match the landed behavior.
- [ ] Attribution headers are present on ai-gateway requests.
- [ ] Targeted tests pass.
## #951 — [QA] Verify transport abstraction + AnthropicTransport wiring
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/951
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `7ab5eebd0` — transport types + Anthropic normalize migration
- `731f4fbae` — transport ABC + AnthropicTransport wired to all paths
### Targeted tests
- `tests/agent/transports/test_types.py`
- `tests/agent/test_anthropic_normalize_v2.py`
### Tasks
- [ ] Verify plain-text Anthropic responses normalize correctly.
- [ ] Verify tool-call responses preserve IDs, names, and arguments.
- [ ] Verify reasoning/thinking is preserved separately from visible content.
- [ ] Verify finish_reason mapping remains correct across paths.
### Acceptance criteria
- [ ] Normalized response shape is stable.
- [ ] Tool-call and reasoning payloads survive normalization.
- [ ] Targeted tests pass.
## #952 — [QA] Verify CLI voice beep toggle
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/952
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `b48ea41d2` — voice: add CLI beep toggle
### Targeted tests
- `tests/tools/test_voice_cli_integration.py`
### Tasks
- [ ] Enable the beep option in config and confirm voice mode emits the beep.
- [ ] Disable the option and confirm the same path is silent.
- [ ] Verify voice mode still strips markdown before speech output.
- [ ] Verify voice mode does not pollute conversation history with TTS-only text.
### Acceptance criteria
- [ ] Beep behavior is actually toggled by config.
- [ ] Existing voice/TTS integration behavior is not regressed.
- [ ] Targeted tests pass.
## #953 — [QA] Verify bundled skill scripts run out of the box
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/953
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `328223576` — make bundled skill scripts runnable out of the box
### Targeted tests
- `tests/agent/test_skill_commands.py`
- `tests/tools/test_local_shell_init.py`
### Tasks
- [ ] Pick a bundled skill that ships a script and run it without manual chmod/PATH surgery.
- [ ] Verify local terminal execution resolves the installed skill script correctly.
- [ ] Verify local shell init still behaves correctly.
### Acceptance criteria
- [ ] Bundled skill scripts execute from the installed skill location with no manual prep.
- [ ] Local shell init remains healthy.
- [ ] Targeted tests pass.
## #954 — [QA] Verify maps skill guest_house / camp_site / bakery expansion
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/954
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `c5a814b23` — maps: add guest_house, camp_site, and dual-key bakery lookup
### Tasks
- [ ] Use the maps skill to search for a guest house in a known populated area.
- [ ] Use the maps skill to search for a camp site in a known populated area.
- [ ] Use the maps skill to search for a bakery and verify both supported keys resolve correctly.
- [ ] Confirm results are sensible and non-empty.
### Acceptance criteria
- [ ] All three place types resolve correctly.
- [ ] Bakery lookup works through both supported keys.
- [ ] Manual evidence is attached in the issue.
## #955 — [QA] Verify KittenTTS local provider end-to-end
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/955
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `1830ebfc5` — add KittenTTS provider
- `2d7ff9c5b` — complete KittenTTS integration across tools/setup/docs/tests
### Targeted tests
- `tests/tools/test_tts_kittentts.py`
### Tasks
- [ ] Configure TTS to use `kittentts`.
- [ ] Generate speech to `.wav` and verify playable output.
- [ ] Verify voice / speed / cleaned text are passed correctly.
- [ ] Generate repeated requests and verify model caching behavior.
- [ ] Generate a non-wav output and verify ffmpeg conversion path.
- [ ] Verify missing-package behavior returns a helpful error.
### Acceptance criteria
- [ ] KittenTTS works end-to-end when installed.
- [ ] Failure mode is operator-friendly when not installed.
- [ ] Targeted tests pass.
## #956 — [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/956
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `d1ed6f4fb` — CLI: add numbered keyboard shortcuts to approval and clarify prompts
### Tasks
- [ ] Trigger an approval prompt and choose an option with number keys.
- [ ] Trigger a clarify prompt and choose an option with number keys.
- [ ] Verify the correct option is submitted both times.
- [ ] Verify normal keyboard navigation still works.
### Acceptance criteria
- [ ] Number-key selection works for both prompt types.
- [ ] Legacy keyboard navigation is not broken.
- [ ] Manual evidence is attached in the issue.
## #957 — [QA] Verify optional adversarial-ux-test skill catalog flow
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/957
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `e50e7f11b` — skills: add adversarial-ux-test optional skill
### Tasks
- [ ] Verify the optional skill appears in the optional skill catalog.
- [ ] Install or enable the skill.
- [ ] Load it successfully through Hermes.
- [ ] Disable or remove it and verify catalog state updates cleanly.
### Acceptance criteria
- [ ] Catalog listing is correct.
- [ ] Install / load / disable lifecycle works cleanly.
- [ ] Manual evidence is attached in the issue.
## #958 — [QA] Verify /usage account limits in CLI + gateway
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/958
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `8a11b0a20` — per-provider account limits module
- `bcc5d7b67` — append account limits section in CLI and gateway
### Targeted tests
- `tests/test_account_usage.py`
- `tests/gateway/test_usage_command.py`
### Tasks
- [ ] Run `/usage` in CLI for a provider with account limits.
- [ ] Verify provider, remaining quota, total limit, and reset window render correctly.
- [ ] Run `/usage` through the gateway and verify the same section appears.
- [ ] Verify zero-value cache read/write sections stay hidden when appropriate.
### Acceptance criteria
- [ ] CLI and gateway both show the landed account-limits section correctly.
- [ ] Targeted tests pass.
## #959 — [QA] Verify OpenCode-Go curated catalog additions
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/959
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `4fea1769d` — opencode-go: add Kimi K2.6 and Qwen3.5/3.6 Plus to curated catalog
### Targeted tests
- `tests/hermes_cli/test_opencode_go_in_model_list.py`
### Tasks
- [ ] With valid OpenCode-Go credentials, open `hermes model`.
- [ ] Verify Kimi K2.6 appears.
- [ ] Verify Qwen 3.5 Plus and 3.6 Plus appear.
- [ ] Unset credentials and verify the provider/catalog hides correctly.
### Acceptance criteria
- [ ] New curated models are present when credentials exist.
- [ ] Catalog visibility still respects credential gating.
- [ ] Targeted tests pass.
## #960 — [QA] Verify patch 'did you mean?' suggestions
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/960
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `15abf4ed8` — add `did you mean?` feedback when patch fails to match
- `5e6427a42` — gate it to true no-match cases and extend to v4a / skill_manage
### Targeted tests
- `tests/tools/test_fuzzy_match.py`
### Tasks
- [ ] Intentionally run a replace/patch with a near-miss `old_string`.
- [ ] Verify the tool suggests a useful nearby line/context.
- [ ] Verify suggestions only appear on true no-match failures.
- [ ] Verify the behavior also works via file tools, v4a patching, and skill_manage.
### Acceptance criteria
- [ ] Suggestion quality is helpful, not noisy.
- [ ] Suggestions are correctly gated to no-match cases.
- [ ] Targeted tests pass.
## #961 — [QA] Verify web dashboard update/restart action buttons
State: closed
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/961
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `fc21c1420` — add buttons to update Hermes and restart gateway
### Files touched
- `web/src/pages/StatusPage.tsx`
- `web/src/lib/api.ts`
- `web/src/i18n/en.ts`
### Tasks
- [ ] Open the Web UI status page and verify both buttons are present.
- [ ] Click Restart Gateway in a safe environment and verify running/output/success-or-failure states render.
- [ ] Click Update Hermes and verify the same action lifecycle.
- [ ] Verify the page remains responsive while actions are running.
### Acceptance criteria
- [ ] Both action buttons are present and wired.
- [ ] Action status polling and result rendering work end-to-end.
- [ ] Manual evidence is attached in the issue.
## #962 — [QA] Verify hardcoded-home path guard on burn/921 branch
State: closed
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/962
### Branch / checkout
- Validate specifically on `burn/921-poka-yoke-hardcoded-paths` (not upstream/main).
### Commits
- `5dcb90531` — Poka-yoke: prevent hardcoded home-directory paths
### Targeted tests
- `tests/test_path_guard.py`
### Tasks
- [ ] Verify hardcoded `/Users/...` paths are rejected.
- [ ] Verify hardcoded `~/.hermes/...` paths are rejected in guarded contexts.
- [ ] Verify valid relative paths still pass.
- [ ] Verify appropriate absolute paths still pass where intended.
- [ ] Verify linting catches violations in non-test files.
### Acceptance criteria
- [ ] Guard blocks the dangerous patterns and preserves allowed ones.
- [ ] Targeted tests pass.

View File

@@ -0,0 +1,67 @@
{
"generated_at": "2026-04-22T16:21:56.271426+00:00",
"config": {
"total_images": 2,
"runs_per_model": 1,
"models": {
"gemma4": "Gemma 4 27B",
"gemini3_flash": "Gemini 3 Flash Preview"
}
},
"results": [
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_mark",
"category": "screenshot"
},
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_social",
"category": "screenshot"
}
],
"summary": {
"gemma4": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500"
]
},
"gemini3_flash": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found"
]
}
}
}

View File

@@ -0,0 +1,44 @@
# Vision Benchmark Report
Generated: 2026-04-22T16:21
Images tested: 2
Runs per model: 1
Models: Gemma 4 27B, Gemini 3 Flash Preview
## Latency Comparison
| Model | Mean (ms) | Median | P95 | Std Dev |
|-------|-----------|--------|-----|---------|
## Accuracy Comparison
| Model | OCR Accuracy | Keyword Coverage | Success Rate |
|-------|-------------|-----------------|--------------|
## Token Usage
| Model | Mean Tokens/Image | Total Tokens |
|-------|------------------|--------------|
## Failure Modes
### Gemma 4 27B
- Summary: All runs failed
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500
### Gemini 3 Flash Preview
- Summary: All runs failed
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=REDACTED'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found
## Verdict
Benchmark blocked or insufficient data for a trustworthy winner.
Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.

View File

@@ -1,301 +0,0 @@
#!/usr/bin/env python3
"""Build a morning review packet from a Gitea epic and its child QA issues.
This script fetches a parent epic plus its sub-issues, extracts the structured
sections from each QA issue body, and renders a single markdown packet suitable
for morning review.
Usage:
python scripts/morning_review_packet.py --epic-number 949
python scripts/morning_review_packet.py --epic-number 949 --children 950-962
python scripts/morning_review_packet.py --epic-number 949 --output docs/review_packets/hermes-harness-2026-04-21.md
"""
from __future__ import annotations
import argparse
import json
import os
import re
import urllib.request
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable
DEFAULT_BASE_URL = "https://forge.alexanderwhitestone.com"
DEFAULT_OWNER = "Timmy_Foundation"
DEFAULT_REPO = "hermes-agent"
DEFAULT_TOKEN_PATH = Path.home() / ".config" / "gitea" / "token"
@dataclass(frozen=True)
class CommitEvidence:
    """Immutable pair describing one commit cited as evidence in a QA issue."""

    # Commit identifier text (the packet shows abbreviated SHAs such as `b11753879`).
    sha: str
    # Free-text description that accompanies the SHA.
    summary: str
@dataclass
class ReviewIssue:
    """Structured record for one issue that appears in the review packet.

    The four leading fields are required; every other field has a default so a
    record can be built before its body sections are parsed.
    """

    # Issue number, title, state string and URL.
    number: int
    title: str
    state: str
    url: str
    # NOTE(review): presumably the issue's comment count — confirm against the caller.
    comments: int = 0
    # Number of the parent epic, or None when no parent reference is known.
    parent_issue: int | None = None
    # The lists below each start empty (a fresh list per instance).
    # NOTE(review): presumably filled from the matching `## ...` sections of the
    # issue body (checkout notes, commits, tests, files, tasks, acceptance
    # criteria) — the populating code is outside this view; confirm.
    checkout_notes: list[str] = field(default_factory=list)
    commits: list[CommitEvidence] = field(default_factory=list)
    targeted_tests: list[str] = field(default_factory=list)
    files_touched: list[str] = field(default_factory=list)
    tasks: list[str] = field(default_factory=list)
    acceptance_criteria: list[str] = field(default_factory=list)
def parse_issue_number_spec(spec: str) -> list[int]:
    """Expand a spec such as ``950-952,955,962`` into issue numbers.

    Items are comma-separated; each is either a single number or an inclusive
    ``start-end`` range. Empty items are ignored and duplicates are dropped,
    keeping the first occurrence's position.

    Raises:
        ValueError: if a range runs backwards or an item is not an integer.
    """
    # An insertion-ordered dict doubles as an ordered set.
    ordered: dict[int, None] = {}
    for raw in spec.split(","):
        token = raw.strip()
        if not token:
            continue
        if "-" not in token:
            ordered.setdefault(int(token), None)
            continue
        low_text, high_text = token.split("-", 1)
        low = int(low_text.strip())
        high = int(high_text.strip())
        if high < low:
            raise ValueError(f"Invalid descending issue range: {token}")
        for value in range(low, high + 1):
            ordered.setdefault(value, None)
    return list(ordered)
def _parse_sections(body: str) -> dict[str, list[str]]:
sections: dict[str, list[str]] = {}
current: str | None = None
for raw_line in body.splitlines():
line = raw_line.rstrip()
if line.startswith("## "):
current = line[3:].strip()
sections[current] = []
continue
if current is not None:
sections[current].append(line)
return sections
def _clean_bullet(line: str) -> str | None:
stripped = line.strip()
if not stripped:
return None
stripped = re.sub(r"^-\s*\[(?: |x|X)\]\s*", "", stripped)
stripped = re.sub(r"^-\s*", "", stripped)
return stripped.strip() or None
def _extract_bullets(lines: Iterable[str]) -> list[str]:
    """Clean every line with ``_clean_bullet`` and keep the non-empty results."""
    # filter(None, ...) drops the None results returned for blank lines.
    return list(filter(None, map(_clean_bullet, lines)))
def _extract_parent_issue(body: str, sections: dict[str, list[str]]) -> int | None:
parent_lines = sections.get("Parent", [])
for line in parent_lines:
match = re.search(r"#(\d+)", line)
if match:
return int(match.group(1))
match = re.search(r"Linked to Epic\s+#(\d+)", body, flags=re.IGNORECASE)
if match:
return int(match.group(1))
return None
def _extract_commits(lines: Iterable[str]) -> list[CommitEvidence]:
    """Turn the bullets of a commit section into ``CommitEvidence`` records.

    A bullet that starts with a backticked token yields that token as ``sha``
    and the remainder as ``summary``; any other bullet becomes a record with
    an empty ``sha`` and the whole bullet as ``summary``.
    """
    evidence: list[CommitEvidence] = []
    for entry in _extract_bullets(lines):
        parsed = re.match(r"`([^`]+)`\s*(.*)", entry)
        if parsed is None:
            sha, summary = "", entry
        else:
            sha, summary = parsed.group(1).strip(), parsed.group(2).strip()
        evidence.append(CommitEvidence(sha=sha, summary=summary))
    return evidence
def _strip_backticks(items: Iterable[str]) -> list[str]:
cleaned: list[str] = []
for item in items:
cleaned.append(item.replace("`", "").strip())
return cleaned
def discover_child_issue_numbers(epic_body: str) -> list[int]:
    """Collect the issue numbers referenced in an epic's "Sub-issues" section.

    Every ``#<digits>`` reference in that section is returned once, in order
    of first appearance. A missing or empty section yields an empty list.
    """
    sub_issue_lines = _parse_sections(epic_body).get("Sub-issues") or []
    referenced = (
        int(hit.group(1))
        for text in sub_issue_lines
        for hit in re.finditer(r"#(\d+)", text)
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(referenced))
def parse_child_issue(issue: dict) -> ReviewIssue:
    """Build a ``ReviewIssue`` from an issue payload dict.

    Scalar fields come straight from the payload (with fallbacks for missing
    values); list fields are parsed from the ``## `` sections of the body.
    Both "Commits landed today" and "Commit landed today" headings are
    accepted, the plural taking precedence when it has content.
    """
    body = issue.get("body") or ""
    sections = _parse_sections(body)

    def bullets(heading: str) -> list[str]:
        # Bullets under one section heading; empty when the heading is absent.
        return _extract_bullets(sections.get(heading, []))

    commit_lines = sections.get("Commits landed today", []) or sections.get("Commit landed today", [])
    raw_state = issue.get("state") or "unknown"
    link = issue.get("html_url") or issue.get("url") or ""

    return ReviewIssue(
        number=int(issue["number"]),
        title=issue.get("title") or "",
        state=raw_state.lower(),
        url=link,
        comments=int(issue.get("comments") or 0),
        parent_issue=_extract_parent_issue(body, sections),
        checkout_notes=bullets("Branch / checkout"),
        commits=_extract_commits(commit_lines),
        targeted_tests=_strip_backticks(bullets("Targeted tests")),
        files_touched=_strip_backticks(bullets("Files touched")),
        tasks=bullets("Tasks"),
        acceptance_criteria=bullets("Acceptance Criteria"),
    )
def build_packet_markdown(epic_issue: dict, child_issues: list[ReviewIssue]) -> str:
    """Render an epic and its parsed child issues as one markdown packet.

    The packet contains, in order: a title and link to the epic, the epic
    title and body under "Epic context", a summary table with one row per
    child, and a detail section per child. Children are ordered by issue
    number. Detail sub-sections are emitted only when they have entries.

    Args:
        epic_issue: Issue payload dict for the epic; ``title``, ``html_url`` /
            ``url``, ``body`` and ``number`` are read, all optional.
        child_issues: Parsed child issues to include.

    Returns:
        The markdown text, ending with exactly one newline.
    """
    title = epic_issue.get("title") or f"Epic #{epic_issue.get('number')}"
    url = epic_issue.get("html_url") or epic_issue.get("url") or ""
    body = epic_issue.get("body") or ""
    children = sorted(child_issues, key=lambda item: item.number)

    lines: list[str] = [
        "# Morning Review Packet",
        "",
        f"Source epic: [{title}]({url})",
        "",
        "## Epic context",
        "",
        title,
        "",
    ]
    # Whitespace-only body lines are normalised to truly empty lines.
    lines.extend(line if line.strip() else "" for line in body.splitlines())
    lines.extend(
        [
            "",
            "## Summary",
            "",
            "| Issue | State | Commits | Tests |",
            "| --- | --- | --- | --- |",
        ]
    )
    lines.extend(
        f"| #{child.number} | {child.state} | {len(child.commits)} | {len(child.targeted_tests)} |"
        for child in children
    )
    lines.append("")

    for child in children:
        # Number and title are separated by an em dash (as commit bullets are);
        # without it the heading reads "#950[QA] ...".
        lines.extend(
            [
                f"## #{child.number} — {child.title}",
                "",
                f"State: {child.state}",
                f"URL: {child.url}",
                "",
            ]
        )
        detail_sections = [
            ("### Branch / checkout", [f"- {note}" for note in child.checkout_notes]),
            (
                "### Commits",
                [
                    f"- `{commit.sha}` — {commit.summary}" if commit.sha else f"- {commit.summary}"
                    for commit in child.commits
                ],
            ),
            ("### Targeted tests", [f"- `{test_path}`" for test_path in child.targeted_tests]),
            ("### Files touched", [f"- `{file_path}`" for file_path in child.files_touched]),
            ("### Tasks", [f"- [ ] {task}" for task in child.tasks]),
            ("### Acceptance criteria", [f"- [ ] {item}" for item in child.acceptance_criteria]),
        ]
        for heading, bullet_lines in detail_sections:
            if bullet_lines:
                lines.append(heading)
                lines.extend(bullet_lines)
                lines.append("")

    return "\n".join(lines).rstrip() + "\n"
def _resolve_token(explicit_token: str | None = None) -> str:
    """Pick the Gitea token to use, trimmed of surrounding whitespace.

    Precedence: the explicit argument, then the ``GITEA_TOKEN`` environment
    variable, then the contents of ``DEFAULT_TOKEN_PATH``.

    Raises:
        FileNotFoundError: if none of the three sources provides a token.
    """
    candidate = explicit_token or os.environ.get("GITEA_TOKEN")
    if candidate:
        return candidate.strip()
    if not DEFAULT_TOKEN_PATH.exists():
        raise FileNotFoundError(f"No Gitea token found. Set GITEA_TOKEN or create {DEFAULT_TOKEN_PATH}")
    return DEFAULT_TOKEN_PATH.read_text().strip()
def fetch_issue(base_url: str, owner: str, repo: str, number: int, token: str) -> dict:
    """Fetch one issue from the forge's ``/api/v1`` endpoint and return its decoded JSON.

    The request carries ``Authorization: token <token>`` and uses a 30-second
    timeout. A trailing slash on ``base_url`` is tolerated.
    """
    endpoint = f"{base_url.rstrip('/')}/api/v1/repos/{owner}/{repo}/issues/{number}"
    request = urllib.request.Request(endpoint)
    request.add_header("Authorization", f"token {token}")
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return json.loads(payload.decode())
def collect_child_issues(
    base_url: str,
    owner: str,
    repo: str,
    epic_issue: dict,
    token: str,
    children_spec: str | None = None,
) -> list[dict]:
    """Fetch the payload of every child issue of an epic.

    When ``children_spec`` is given it is parsed as an explicit list/range
    spec; otherwise the numbers are discovered from the epic body.
    """
    if children_spec:
        numbers = parse_issue_number_spec(children_spec)
    else:
        numbers = discover_child_issue_numbers(epic_issue.get("body") or "")

    fetched: list[dict] = []
    for number in numbers:
        fetched.append(fetch_issue(base_url, owner, repo, number, token))
    return fetched
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: build the packet and write or print it.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success. Errors from token resolution or fetching propagate
        as exceptions.
    """
    parser = argparse.ArgumentParser(description="Build a markdown morning review packet from a Gitea epic")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--owner", default=DEFAULT_OWNER)
    parser.add_argument("--repo", default=DEFAULT_REPO)
    parser.add_argument("--epic-number", type=int, required=True)
    parser.add_argument("--children", help="Explicit issue list/ranges, e.g. 950-962")
    parser.add_argument("--token", help="Gitea token (defaults to GITEA_TOKEN or ~/.config/gitea/token)")
    parser.add_argument("--output", help="Write markdown packet to this path instead of stdout")
    args = parser.parse_args(argv)

    token = _resolve_token(args.token)
    epic_issue = fetch_issue(args.base_url, args.owner, args.repo, args.epic_number, token)
    child_issue_dicts = collect_child_issues(args.base_url, args.owner, args.repo, epic_issue, token, args.children)
    packet = build_packet_markdown(epic_issue, [parse_child_issue(issue) for issue in child_issue_dicts])

    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # The packet contains non-ASCII text (em dashes, arbitrary issue
        # bodies); write UTF-8 explicitly rather than the locale encoding.
        output_path.write_text(packet, encoding="utf-8")
    else:
        print(packet, end="")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -1,162 +0,0 @@
from pathlib import Path
import sys
SCRIPT_DIR = Path(__file__).resolve().parents[1] / "scripts"
sys.path.insert(0, str(SCRIPT_DIR))
import morning_review_packet as mrp
EPIC_BODY = """Source: git log on upstream/main since 2026-04-21 00:00 EDT.
## Success criteria
- [ ] Every issue has a clear PASS / FAIL outcome.
## Sub-issues
- [ ] #950 [QA] Verify AI Gateway provider UX + attribution headers
- [ ] #951 [QA] Verify transport abstraction + AnthropicTransport wiring
- [x] #962 [QA] Verify hardcoded-home path guard on burn/921 branch
"""
CHILD_BODY_PLURAL = """## Parent
#949
## Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
## Commits landed today
- `b11753879` attribution default_headers for ai-gateway provider
- `700437440` curated picker with live pricing
## Targeted tests
- `tests/hermes_cli/test_ai_gateway_models.py`
- `tests/run_agent/test_provider_attribution_headers.py`
## Tasks
- [ ] Verify the picker ordering.
- [ ] Verify attribution headers.
## Acceptance Criteria
- [ ] Picker shows AI Gateway prominently.
- [ ] Headers appear on OpenRouter calls.
"""
CHILD_BODY_SINGULAR = """## Parent
#949
## Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
## Commit landed today
- `fc21c1420` add buttons to update Hermes and restart gateway
## Files touched
- `web/src/pages/StatusPage.tsx`
- `web/src/lib/api.ts`
- `web/src/i18n/en.ts`
## Tasks
- [ ] Open the Web UI status page and verify both buttons are present.
- [ ] Click Restart Gateway in a safe environment.
"""
def test_discover_child_issue_numbers_from_epic_body():
    """Sub-issue references are found in order, whether checked or unchecked."""
    discovered = mrp.discover_child_issue_numbers(EPIC_BODY)
    assert discovered == [950, 951, 962]
def test_parse_issue_number_spec_supports_ranges_and_lists():
    """A spec may mix an inclusive range with individual numbers."""
    expanded = mrp.parse_issue_number_spec("950-952,955,962")
    assert expanded == [950, 951, 952, 955, 962]
def test_parse_child_issue_extracts_structured_sections():
    """Every section of the plural-heading fixture lands in its own field."""
    payload = dict(
        number=950,
        title="[QA] Verify AI Gateway provider UX + attribution headers",
        state="open",
        html_url="https://forge.example/950",
        comments=0,
        body=CHILD_BODY_PLURAL,
    )
    parsed = mrp.parse_child_issue(payload)

    assert parsed.number == 950
    assert parsed.parent_issue == 949
    assert parsed.checkout_notes == ["Validate on `upstream/main` or an equivalent synced checkout."]

    shas = [commit.sha for commit in parsed.commits]
    assert shas == ["b11753879", "700437440"]

    expected_tests = [
        "tests/hermes_cli/test_ai_gateway_models.py",
        "tests/run_agent/test_provider_attribution_headers.py",
    ]
    assert parsed.targeted_tests == expected_tests

    expected_tasks = ["Verify the picker ordering.", "Verify attribution headers."]
    assert parsed.tasks == expected_tasks

    expected_criteria = [
        "Picker shows AI Gateway prominently.",
        "Headers appear on OpenRouter calls.",
    ]
    assert parsed.acceptance_criteria == expected_criteria
def test_parse_child_issue_handles_singular_commit_heading_and_files_touched():
    """The singular "Commit landed today" heading and "Files touched" are parsed."""
    payload = dict(
        number=961,
        title="[QA] Verify web dashboard update/restart action buttons",
        state="closed",
        html_url="https://forge.example/961",
        comments=16,
        body=CHILD_BODY_SINGULAR,
    )
    parsed = mrp.parse_child_issue(payload)

    shas = [commit.sha for commit in parsed.commits]
    assert shas == ["fc21c1420"]

    expected_files = [
        "web/src/pages/StatusPage.tsx",
        "web/src/lib/api.ts",
        "web/src/i18n/en.ts",
    ]
    assert parsed.files_touched == expected_files

    expected_tasks = [
        "Open the Web UI status page and verify both buttons are present.",
        "Click Restart Gateway in a safe environment.",
    ]
    assert parsed.tasks == expected_tasks
def test_build_packet_markdown_renders_summary_and_details():
    """The packet has the header, summary rows, per-issue headings and details."""
    epic_issue = dict(
        number=949,
        title="EPIC: Morning review packet — Hermes harness features landed 2026-04-21",
        state="open",
        html_url="https://forge.example/949",
        body=EPIC_BODY,
    )
    child_payloads = [
        dict(
            number=950,
            title="[QA] Verify AI Gateway provider UX + attribution headers",
            state="open",
            html_url="https://forge.example/950",
            comments=0,
            body=CHILD_BODY_PLURAL,
        ),
        dict(
            number=961,
            title="[QA] Verify web dashboard update/restart action buttons",
            state="closed",
            html_url="https://forge.example/961",
            comments=16,
            body=CHILD_BODY_SINGULAR,
        ),
    ]
    children = [mrp.parse_child_issue(payload) for payload in child_payloads]

    markdown = mrp.build_packet_markdown(epic_issue, children)

    expected_fragments = (
        "# Morning Review Packet",
        "EPIC: Morning review packet — Hermes harness features landed 2026-04-21",
        "| #950 | open | 2 | 2 |",
        "| #961 | closed | 1 | 0 |",
        "## #950 — [QA] Verify AI Gateway provider UX + attribution headers",
        "## #961 — [QA] Verify web dashboard update/restart action buttons",
        "`b11753879` — attribution default_headers for ai-gateway provider",
        "`web/src/pages/StatusPage.tsx`",
    )
    for fragment in expected_fragments:
        assert fragment in markdown

View File

@@ -199,7 +199,7 @@ class TestMarkdown:
class TestDataset:
def test_sample_dataset_has_entries(self):
dataset = generate_sample_dataset()
assert len(dataset) >= 4
assert len(dataset) >= 50
def test_sample_dataset_structure(self):
dataset = generate_sample_dataset()
@@ -216,6 +216,9 @@ class TestDataset:
assert "screenshot" in categories
assert "diagram" in categories
assert "photo" in categories
assert "chart" in categories
assert "ocr" in categories
assert "document" in categories
class TestModels:

View File

@@ -0,0 +1,21 @@
import json
from pathlib import Path
DATASET = Path("benchmarks/test_images.json")
REPORT = Path("metrics/vision-benchmark-smoke-2026-04-22.md")
def test_benchmark_dataset_is_issue_sized_and_category_complete() -> None:
    """The dataset file has at least 50 entries and covers all six categories."""
    raw = DATASET.read_text(encoding="utf-8")
    items = json.loads(raw)
    assert len(items) >= 50

    required = {"screenshot", "diagram", "photo", "ocr", "chart", "document"}
    present = {entry["category"] for entry in items}
    assert required <= present
def test_metrics_report_exists_with_recommendation() -> None:
    """The smoke report exists and names a recommendation and both model families."""
    assert REPORT.exists(), "missing benchmark report under metrics/"
    text = REPORT.read_text(encoding="utf-8")
    for needle in ("Recommendation", "Gemma 4", "Gemini"):
        assert needle in text