Compare commits

..

3 Commits

Author SHA1 Message Date
Alexander Whitestone
9d05f77a9b feat: harden vision benchmark artifacts
All checks were successful
Lint / lint (pull_request) Successful in 9s
Refs #817
2026-04-22 12:22:28 -04:00
Alexander Whitestone
23e093fc75 wip: tighten vision benchmark acceptance tests 2026-04-22 12:10:23 -04:00
Alexander Whitestone
f77ce4dff2 wip: add regression tests for vision benchmark artifacts 2026-04-22 12:07:52 -04:00
10 changed files with 1137 additions and 2068 deletions

View File

@@ -1,194 +1,757 @@
[
{
"id": "screenshot_github_home",
"id": "screenshot_github_mark",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "mark"],
"expected_keywords": [
"github",
"logo",
"mark"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_mermaid_flow",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
"id": "screenshot_github_social",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": [
"github",
"page",
"web"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_github_code_search",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": [
"search",
"code",
"feature"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_terminal_capture",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": [
"terminal",
"command",
"output"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_http_404",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": [
"404",
"error",
"cat"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_01",
"url": "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01",
"category": "screenshot",
"expected_keywords": [
"hermes",
"cli",
"session"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_02",
"url": "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard",
"category": "screenshot",
"expected_keywords": [
"prompt",
"cache",
"dashboard"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_01",
"url": "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode",
"category": "screenshot",
"expected_keywords": [
"settings",
"voice",
"mode"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_02",
"url": "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview",
"category": "screenshot",
"expected_keywords": [
"browser",
"vision",
"preview"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_03",
"url": "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector",
"category": "screenshot",
"expected_keywords": [
"tool",
"call",
"inspector"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_flow_a",
"url": "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker",
"category": "diagram",
"expected_keywords": ["flow", "diagram", "process"],
"expected_keywords": [
"flowchart",
"api",
"worker"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_1",
"url": "https://picsum.photos/seed/vision1/400/300",
"category": "photo",
"expected_keywords": [],
"id": "diagram_flow_b",
"url": "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client",
"category": "diagram",
"expected_keywords": [
"architecture",
"diagram",
"cache"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_2",
"url": "https://picsum.photos/seed/vision2/400/300",
"category": "photo",
"expected_keywords": [],
"id": "diagram_uml_a",
"url": "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message",
"category": "diagram",
"expected_keywords": [
"class",
"diagram",
"session"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "chart_simple_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "revenue"],
"id": "diagram_uml_b",
"url": "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response",
"category": "diagram",
"expected_keywords": [
"sequence",
"diagram",
"response"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "chart_pie",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": ["pie", "chart", "percentage"],
"id": "diagram_network_a",
"url": "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router",
"category": "diagram",
"expected_keywords": [
"network",
"node",
"router"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_network_b",
"url": "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth",
"category": "diagram",
"expected_keywords": [
"service",
"mesh",
"auth"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_state_machine",
"url": "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop",
"category": "diagram",
"expected_keywords": [
"state",
"machine",
"idle"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_mind_map",
"url": "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools",
"category": "diagram",
"expected_keywords": [
"mind",
"memory",
"tools"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_pipeline",
"url": "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize",
"category": "diagram",
"expected_keywords": [
"pipeline",
"ingest",
"summarize"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_org_chart",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops",
"category": "diagram",
"expected_keywords": ["organization", "hierarchy", "chart"],
"expected_keywords": [
"org",
"chart",
"review"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "screenshot_terminal",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"id": "photo_random_01",
"url": "https://picsum.photos/seed/vision-bench-1/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_line",
"id": "photo_random_02",
"url": "https://picsum.photos/seed/vision-bench-2/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_03",
"url": "https://picsum.photos/seed/vision-bench-3/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_04",
"url": "https://picsum.photos/seed/vision-bench-4/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_05",
"url": "https://picsum.photos/seed/vision-bench-5/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_06",
"url": "https://picsum.photos/seed/vision-bench-6/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_07",
"url": "https://picsum.photos/seed/vision-bench-7/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_08",
"url": "https://picsum.photos/seed/vision-bench-8/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_09",
"url": "https://picsum.photos/seed/vision-bench-9/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_10",
"url": "https://picsum.photos/seed/vision-bench-10/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_bar_quarterly",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": [
"bar",
"chart",
"revenue"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_pie_market",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": [
"pie",
"chart",
"percentage"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_line_temp",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"category": "chart",
"expected_keywords": ["line", "chart", "temperature"],
"expected_keywords": [
"line",
"chart",
"temperature"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "diagram_sequence",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_radar",
"id": "chart_radar_skill",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"category": "chart",
"expected_keywords": ["radar", "chart", "skill"],
"expected_keywords": [
"radar",
"chart",
"skill"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_5",
"url": "https://picsum.photos/seed/vision5/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": ["doughnut", "chart", "device"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
},
{
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_stacked_bar",
"id": "chart_stacked_cloud",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"category": "chart",
"expected_keywords": ["stacked", "bar", "chart"],
"expected_keywords": [
"stacked",
"bar",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "screenshot_dashboard",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": ["search", "code", "feature"],
"id": "chart_area_growth",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}",
"category": "chart",
"expected_keywords": [
"line",
"growth",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_8",
"url": "https://picsum.photos/seed/vision8/400/300",
"category": "photo",
"expected_keywords": [],
"id": "chart_scatter_eval",
"url": "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}",
"category": "chart",
"expected_keywords": [
"scatter",
"chart",
"runs"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_horizontal_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}",
"category": "chart",
"expected_keywords": [
"bar",
"score",
"ocr"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_bubble_usage",
"url": "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}",
"category": "chart",
"expected_keywords": [
"bubble",
"latency",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_doughnut_devices",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": [
"doughnut",
"chart",
"device"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "ocr_text_01",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Hermes+OCR+Alpha+01",
"category": "ocr",
"expected_keywords": [
"hermes",
"ocr"
],
"ground_truth_ocr": "Hermes OCR Alpha 01",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_02",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Prompt+Cache+Hit+87%",
"category": "ocr",
"expected_keywords": [
"prompt",
"cache"
],
"ground_truth_ocr": "Prompt Cache Hit 87%",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_03",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Session+42+Ready",
"category": "ocr",
"expected_keywords": [
"session",
"42"
],
"ground_truth_ocr": "Session 42 Ready",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_04",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Latency+118+ms",
"category": "ocr",
"expected_keywords": [
"latency",
"118"
],
"ground_truth_ocr": "Latency 118 ms",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "ocr_text_05",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Voice+Mode+Enabled",
"category": "ocr",
"expected_keywords": [
"voice",
"mode"
],
"ground_truth_ocr": "Voice Mode Enabled",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "document_text_01",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Invoice+1001+Total+42+Due+2026-04-22",
"category": "document",
"expected_keywords": [
"invoice",
"1001",
"total"
],
"ground_truth_ocr": "Invoice 1001 Total 42 Due 2026-04-22",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_02",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Form+A+Name+Alice+Status+Approved",
"category": "document",
"expected_keywords": [
"form",
"a",
"name"
],
"ground_truth_ocr": "Form A Name Alice Status Approved",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "document_text_03",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Report+Memory+Recall+Score+91+Percent",
"category": "document",
"expected_keywords": [
"report",
"memory",
"recall"
],
"ground_truth_ocr": "Report Memory Recall Score 91 Percent",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_04",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Checklist+Crisis+Escalation+Call+988+Now",
"category": "document",
"expected_keywords": [
"checklist",
"crisis",
"escalation"
],
"ground_truth_ocr": "Checklist Crisis Escalation Call 988 Now",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_05",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Meeting+Notes+Vision+Benchmark+Run+Pending",
"category": "document",
"expected_keywords": [
"meeting",
"notes",
"vision"
],
"ground_truth_ocr": "Meeting Notes Vision Benchmark Run Pending",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
}
]
]

View File

@@ -22,10 +22,12 @@ import argparse
import asyncio
import base64
import json
import mimetypes
import os
import statistics
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
@@ -41,12 +43,16 @@ MODELS = {
"model_id": "google/gemma-4-27b-it",
"display_name": "Gemma 4 27B",
"provider": "nous",
"fallback_provider": "ollama",
"fallback_model_id": "gemma4:latest",
"description": "Google's multimodal Gemma 4 model",
},
"gemini3_flash": {
"model_id": "google/gemini-3-flash-preview",
"display_name": "Gemini 3 Flash Preview",
"provider": "openrouter",
"fallback_provider": "gemini",
"fallback_model_id": "gemini-2.5-flash",
"description": "Current default vision model",
},
}
@@ -84,91 +90,150 @@ async def analyze_with_model(
"""
import httpx
def _load_image_bytes_cached() -> tuple[bytes, str]:
nonlocal _image_bytes, _mime_type
if _image_bytes is not None:
return _image_bytes, _mime_type
if image_url.startswith(("http://", "https://")):
with urllib.request.urlopen(image_url, timeout=30) as resp:
_image_bytes = resp.read()
_mime_type = resp.headers.get_content_type() or mimetypes.guess_type(image_url)[0] or "image/png"
else:
path = Path(image_url).expanduser()
_image_bytes = path.read_bytes()
_mime_type = mimetypes.guess_type(str(path))[0] or "image/png"
return _image_bytes, _mime_type
def _data_url() -> str:
image_bytes, mime_type = _load_image_bytes_cached()
return f"data:{mime_type};base64,{base64.b64encode(image_bytes).decode()}"
def _provider_key(provider: str) -> str:
if provider == "openrouter":
return os.getenv("OPENROUTER_API_KEY", "")
if provider == "nous":
return os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
if provider == "gemini":
return os.getenv("GEMINI_API_KEY", "") or os.getenv("GOOGLE_API_KEY", "")
return os.getenv(f"{provider.upper()}_API_KEY", "")
provider = model_config["provider"]
model_id = model_config["model_id"]
candidates = [(provider, model_id)]
if model_config.get("fallback_provider") and model_config.get("fallback_model_id"):
candidates.append((model_config["fallback_provider"], model_config["fallback_model_id"]))
# Prepare messages
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
_image_bytes: Optional[bytes] = None
_mime_type = "image/png"
failures = []
# Route to provider
if provider == "openrouter":
api_url = "https://openrouter.ai/api/v1/chat/completions"
api_key = os.getenv("OPENROUTER_API_KEY", "")
elif provider == "nous":
api_url = "https://inference.nousresearch.com/v1/chat/completions"
api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
else:
api_url = os.getenv(f"{provider.upper()}_API_URL", "")
api_key = os.getenv(f"{provider.upper()}_API_KEY", "")
for candidate_provider, candidate_model in candidates:
api_key = _provider_key(candidate_provider)
start = time.perf_counter()
try:
if candidate_provider in {"openrouter", "nous"}:
api_url = (
"https://openrouter.ai/api/v1/chat/completions"
if candidate_provider == "openrouter"
else "https://inference.nousresearch.com/v1/chat/completions"
)
if not api_key:
raise RuntimeError(f"No API key for provider {candidate_provider}")
payload = {
"model": candidate_model,
"messages": [{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": _data_url() if not image_url.startswith(("http://", "https://")) else image_url}},
],
}],
"max_tokens": 2000,
"temperature": 0.1,
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
analysis = data.get("choices", [{}])[0].get("message", {}).get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
elif candidate_provider == "gemini":
if not api_key:
raise RuntimeError("No API key for provider gemini")
image_bytes, mime_type = _load_image_bytes_cached()
api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{candidate_model}:generateContent?key={api_key}"
payload = {
"contents": [{"parts": [
{"text": prompt},
{"inline_data": {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}},
]}],
"generationConfig": {"temperature": 0.1, "maxOutputTokens": 2000},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload)
resp.raise_for_status()
data = resp.json()
parts = data.get("candidates", [{}])[0].get("content", {}).get("parts", [])
analysis = "\n".join(part.get("text", "") for part in parts if isinstance(part, dict) and part.get("text"))
usage = data.get("usageMetadata", {})
tokens = {
"prompt_tokens": usage.get("promptTokenCount", 0),
"completion_tokens": usage.get("candidatesTokenCount", 0),
"total_tokens": usage.get("totalTokenCount", 0),
}
elif candidate_provider == "ollama":
image_bytes, _ = _load_image_bytes_cached()
payload = {
"model": candidate_model,
"stream": False,
"messages": [{"role": "user", "content": prompt, "images": [base64.b64encode(image_bytes).decode()]}],
"options": {"temperature": 0.1},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post("http://localhost:11434/api/chat", json=payload)
resp.raise_for_status()
data = resp.json()
analysis = data.get("message", {}).get("content", "")
tokens = {
"prompt_tokens": data.get("prompt_eval_count", 0),
"completion_tokens": data.get("eval_count", 0),
"total_tokens": (data.get("prompt_eval_count", 0) or 0) + (data.get("eval_count", 0) or 0),
}
else:
raise RuntimeError(f"Unsupported provider {candidate_provider}")
if not api_key:
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": f"No API key for provider {provider}",
}
latency_ms = (time.perf_counter() - start) * 1000
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
"provider_used": candidate_provider,
"model_used": candidate_model,
}
except Exception as e:
failures.append(f"{candidate_provider}:{candidate_model} => {e}")
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": " | ".join(failures) if failures else "No runs",
"provider_used": candidates[-1][0] if candidates else provider,
"model_used": candidates[-1][1] if candidates else model_id,
}
payload = {
"model": model_id,
"messages": messages,
"max_tokens": 2000,
"temperature": 0.1,
}
start = time.perf_counter()
try:
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
latency_ms = (time.perf_counter() - start) * 1000
analysis = ""
choices = data.get("choices", [])
if choices:
msg = choices[0].get("message", {})
analysis = msg.get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
}
except Exception as e:
return {
"analysis": "",
"latency_ms": round((time.perf_counter() - start) * 1000, 1),
"tokens": {},
"success": False,
"error": str(e),
}
# ---------------------------------------------------------------------------
# Evaluation metrics
@@ -398,7 +463,13 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
failed = [r[model_name] for r in results if not r[model_name]["success"]]
if not model_results:
summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
summary[model_name] = {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
}
continue
latencies = [r["avg_latency_ms"] for r in model_results]
@@ -410,6 +481,7 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
"success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
"total_runs": len(model_results),
"total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
"latency": {
"mean_ms": round(statistics.mean(latencies), 1),
"median_ms": round(statistics.median(latencies), 1),
@@ -495,6 +567,23 @@ def to_markdown(report: dict) -> str:
f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
)
lines += ["", "## Failure Modes", ""]
had_failures = False
for mkey, mname in config["models"].items():
model_summary = summary.get(mkey, {})
failure_examples = model_summary.get("failure_examples", [])
if not failure_examples and not model_summary.get("error"):
continue
had_failures = True
lines.append(f"### {mname}")
if model_summary.get("error"):
lines.append(f"- Summary: {model_summary['error']}")
for err in failure_examples:
lines.append(f"- {err}")
lines.append("")
if not had_failures:
lines.append("- No provider/runtime failures recorded.")
# Verdict
lines += ["", "## Verdict", ""]
@@ -516,8 +605,12 @@ def to_markdown(report: dict) -> str:
if best_model:
lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
lines.append("")
lines.append("Recommendation: keep the best-performing Gemma/Gemini lane from this run and only switch if repeated runs disagree.")
else:
lines.append("No clear winner — insufficient data.")
lines.append("Benchmark blocked or insufficient data for a trustworthy winner.")
lines.append("")
lines.append("Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.")
return "\n".join(lines)
@@ -528,44 +621,124 @@ def to_markdown(report: dict) -> str:
def generate_sample_dataset() -> List[dict]:
"""Generate a sample test dataset with diverse public images.
"""Generate a larger benchmark dataset aligned with issue #817.
Returns list of test image definitions.
Returns 50+ images across screenshots, diagrams, photos, OCR, charts,
and document-like images so the harness matches the issue contract.
"""
return [
# Screenshots
{
"id": "screenshot_github",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
dataset: List[dict] = []
screenshots = [
("github_mark", "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", ["github", "logo", "mark"]),
("github_social", "https://github.githubassets.com/images/modules/site/social-cards.png", ["github", "page", "web"]),
("github_code_search", "https://github.githubassets.com/images/modules/site/features-code-search.png", ["search", "code", "feature"]),
("terminal_capture", "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", ["terminal", "command", "output"]),
("http_404", "https://http.cat/404.jpg", ["404", "error", "cat"]),
("dummy_cli_01", "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01", ["hermes", "cli", "session"]),
("dummy_cli_02", "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard", ["prompt", "cache", "dashboard"]),
("dummy_ui_01", "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode", ["settings", "voice", "mode"]),
("dummy_ui_02", "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview", ["browser", "vision", "preview"]),
("dummy_ui_03", "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector", ["tool", "call", "inspector"]),
]
for ident, url, keywords in screenshots:
dataset.append({
"id": f"screenshot_{ident}",
"url": url,
"category": "screenshot",
"expected_keywords": ["github", "logo", "octocat"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
# Diagrams
{
"id": "diagram_architecture",
"url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
})
diagrams = [
("flow_a", "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker", ["flowchart", "api", "worker"]),
("flow_b", "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client", ["architecture", "diagram", "cache"]),
("uml_a", "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message", ["class", "diagram", "session"]),
("uml_b", "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response", ["sequence", "diagram", "response"]),
("network_a", "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router", ["network", "node", "router"]),
("network_b", "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth", ["service", "mesh", "auth"]),
("state_machine", "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop", ["state", "machine", "idle"]),
("mind_map", "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools", ["mind", "memory", "tools"]),
("pipeline", "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize", ["pipeline", "ingest", "summarize"]),
("org_chart", "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops", ["org", "chart", "review"]),
]
for ident, url, keywords in diagrams:
dataset.append({
"id": f"diagram_{ident}",
"url": url,
"category": "diagram",
"expected_keywords": ["architecture", "component", "service"],
"expected_structure": {"min_length": 100, "min_sentences": 3},
},
# Photos
{
"id": "photo_nature",
"url": "https://picsum.photos/seed/bench1/400/300",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": False},
})
for idx in range(1, 11):
dataset.append({
"id": f"photo_random_{idx:02d}",
"url": f"https://picsum.photos/seed/vision-bench-{idx}/640/480",
"category": "photo",
"expected_keywords": [],
"expected_structure": {"min_length": 30, "min_sentences": 1},
},
# Charts
{
"id": "chart_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "data"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
})
charts = [
("bar_quarterly", "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", ["bar", "chart", "revenue"]),
("pie_market", "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", ["pie", "chart", "percentage"]),
("line_temp", "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", ["line", "chart", "temperature"]),
("radar_skill", "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", ["radar", "chart", "skill"]),
("stacked_cloud", "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", ["stacked", "bar", "chart"]),
("area_growth", "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}", ["line", "growth", "chart"]),
("scatter_eval", "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}", ["scatter", "chart", "runs"]),
("horizontal_bar", "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}", ["bar", "score", "ocr"]),
("bubble_usage", "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}", ["bubble", "latency", "chart"]),
("doughnut_devices", "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", ["doughnut", "chart", "device"]),
]
for ident, url, keywords in charts:
dataset.append({
"id": f"chart_{ident}",
"url": url,
"category": "chart",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": True},
})
ocr_texts = [
"Hermes OCR Alpha 01",
"Prompt Cache Hit 87%",
"Session 42 Ready",
"Latency 118 ms",
"Voice Mode Enabled",
]
for idx, text in enumerate(ocr_texts, start=1):
dataset.append({
"id": f"ocr_text_{idx:02d}",
"url": f"https://dummyimage.com/1200x320/ffffff/000000.png&text={text.replace(' ', '+')}",
"category": "ocr",
"expected_keywords": text.lower().split()[:2],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 10, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
documents = [
"Invoice 1001 Total 42 Due 2026-04-22",
"Form A Name Alice Status Approved",
"Report Memory Recall Score 91 Percent",
"Checklist Crisis Escalation Call 988 Now",
"Meeting Notes Vision Benchmark Run Pending",
]
for idx, text in enumerate(documents, start=1):
dataset.append({
"id": f"document_text_{idx:02d}",
"url": f"https://dummyimage.com/1400x900/f8fafc/0f172a.png&text={text.replace(' ', '+')}",
"category": "document",
"expected_keywords": text.lower().split()[:3],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 20, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
return dataset
def load_dataset(path: str) -> List[dict]:
@@ -585,7 +758,9 @@ async def main():
parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--markdown-output", default=None, help="Optional markdown report output path")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
parser.add_argument("--limit", type=int, default=0, help="Limit to the first N images for smoke runs")
parser.add_argument("--models", nargs="+", default=None,
help="Models to test (default: all)")
parser.add_argument("--markdown", action="store_true", help="Output markdown report")
@@ -617,9 +792,14 @@ async def main():
print("ERROR: Provide --images or --url")
sys.exit(1)
if args.limit and args.limit > 0:
images = images[:args.limit]
# Run benchmark
report = await run_benchmark_suite(images, selected, args.runs)
markdown_report = to_markdown(report)
# Output
if args.output:
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
@@ -627,8 +807,14 @@ async def main():
json.dump(report, f, indent=2)
print(f"\nResults saved to {args.output}")
if args.markdown_output:
os.makedirs(os.path.dirname(args.markdown_output) or ".", exist_ok=True)
with open(args.markdown_output, "w", encoding="utf-8") as f:
f.write(markdown_report)
print(f"Markdown report saved to {args.markdown_output}")
if args.markdown or not args.output:
print("\n" + to_markdown(report))
print("\n" + markdown_report)
if __name__ == "__main__":

File diff suppressed because it is too large Load Diff

View File

@@ -1,217 +0,0 @@
# TensorZero Evaluation Packet
Issue #860: [tensorzero LLMOps platform evaluation](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/860)
## Scope
This packet evaluates TensorZero as a possible replacement for Hermes' custom provider-routing stack.
It is intentionally grounded in the current repo state rather than a speculative cutover plan.
## Issue requirements being evaluated
- Deploy tensorzero gateway (Rust binary)
- Migrate provider routing config
- Test with canary (10% traffic) before full cutover
- Feed session data for prompt optimization
- Evaluation suite for A/B testing models
## Recommendation
Not ready for direct replacement. Recommend a shadow-evaluation phase first: keep Hermes routing live, inventory the migration seams, export SessionDB/trajectory data into an offline TensorZero experiment loop, and only design a canary gateway once percentage-based rollout controls exist.
## Requirement matrix
| Requirement | Status | Evidence labels | Summary |
| --- | --- | --- | --- |
| Gateway replacement scope | partial | fallback_chain, runtime_provider, gateway_provider_routing, cron_runtime_provider, auxiliary_fallback_chain, delegate_runtime_provider | Hermes already spreads provider routing across core agent, runtime provider, gateway, cron, auxiliary, and delegation seams; TensorZero would need parity across all of them before it can replace the gateway layer. |
| Config migration | partial | provider_routing_config, runtime_provider, smart_model_routing, fallback_chain | Hermes has multiple config concepts to migrate (`provider_routing`, `fallback_providers`, `smart_model_routing`, runtime provider resolution), so TensorZero is not a drop-in config swap. |
| 10% traffic canary | gap | — | The repo shows semantic routing and fallback, but no grounded 10% traffic-split canary mechanism. A TensorZero cutover would need new percentage-based rollout controls and observability hooks. |
| Session data for prompt optimization | partial | session_db, trajectory_export | Hermes already has SessionDB and trajectory export surfaces that can feed offline optimization data, but not a TensorZero-native ingestion path yet. |
| Evaluation suite / A/B testing | partial | benchmark_suite, trajectory_export | Hermes already has benchmark/trajectory machinery that can seed TensorZero A/B evaluation, but no integrated TensorZero experiment runner or live evaluation gateway. |
## Grounded Hermes touchpoints
- `run_agent.py:601` — [fallback_chain] fallback_model: Dict[str, Any] = None,
- `run_agent.py:995` — [fallback_chain] # failure). Supports both legacy single-dict ``fallback_model`` and
- `run_agent.py:996` — [fallback_chain] # new list ``fallback_providers`` format.
- `run_agent.py:997` — [fallback_chain] if isinstance(fallback_model, list):
- `run_agent.py:998` — [fallback_chain] self._fallback_chain = [
- `run_agent.py:999` — [fallback_chain] f for f in fallback_model
- `run_agent.py:1002` — [fallback_chain] elif isinstance(fallback_model, dict) and fallback_model.get("provider") and fallback_model.get("model"):
- `run_agent.py:1003` — [fallback_chain] self._fallback_chain = [fallback_model]
- `run_agent.py:1005` — [fallback_chain] self._fallback_chain = []
- `run_agent.py:1009` — [fallback_chain] self._fallback_model = self._fallback_chain[0] if self._fallback_chain else None
- `run_agent.py:1010` — [fallback_chain] if self._fallback_chain and not self.quiet_mode:
- `run_agent.py:1011` — [fallback_chain] if len(self._fallback_chain) == 1:
- `run_agent.py:1012` — [fallback_chain] fb = self._fallback_chain[0]
- `run_agent.py:1015` — [fallback_chain] print(f"🔄 Fallback chain ({len(self._fallback_chain)} providers): " +
- `run_agent.py:1016` — [fallback_chain] " → ".join(f"{f['model']} ({f['provider']})" for f in self._fallback_chain))
- `run_agent.py:5624` — [fallback_chain] if self._fallback_index >= len(self._fallback_chain):
- `run_agent.py:5627` — [fallback_chain] fb = self._fallback_chain[self._fallback_index]
- `run_agent.py:8559` — [fallback_chain] if self._fallback_index < len(self._fallback_chain):
- `run_agent.py:9355` — [fallback_chain] if is_rate_limited and self._fallback_index < len(self._fallback_chain):
- `run_agent.py:10460` — [fallback_chain] if _truly_empty and self._fallback_chain:
- `run_agent.py:10514` — [fallback_chain] + (" and fallback attempts." if self._fallback_chain else
- `cli.py:241` — [provider_routing_config] "smart_model_routing": {
- `cli.py:370` — [provider_routing_config] # (e.g. platform_toolsets, provider_routing, memory, honcho, etc.)
- `cli.py:1753` — [provider_routing_config] pr = CLI_CONFIG.get("provider_routing", {}) or {}
- `cli.py:1762` — [provider_routing_config] # Supports new list format (fallback_providers) and legacy single-dict (fallback_model).
- `cli.py:1763` — [provider_routing_config] fb = CLI_CONFIG.get("fallback_providers") or CLI_CONFIG.get("fallback_model") or []
- `cli.py:1770` — [provider_routing_config] self._smart_model_routing = CLI_CONFIG.get("smart_model_routing", {}) or {}
- `cli.py:2771` — [provider_routing_config] from agent.smart_model_routing import resolve_turn_route
- `cli.py:2776` — [provider_routing_config] self._smart_model_routing,
- `hermes_cli/runtime_provider.py:209` — [runtime_provider] def resolve_requested_provider(requested: Optional[str] = None) -> str:
- `hermes_cli/runtime_provider.py:649` — [runtime_provider] def resolve_runtime_provider(
- `agent/smart_model_routing.py:62` — [smart_model_routing] def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
- `agent/smart_model_routing.py:110` — [smart_model_routing] def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]:
- `gateway/run.py:1271` — [gateway_provider_routing] def _load_provider_routing() -> dict:
- `gateway/run.py:1285` — [gateway_provider_routing] def _load_fallback_model() -> list | dict | None:
- `gateway/run.py:1306` — [gateway_provider_routing] def _load_smart_model_routing() -> dict:
- `cron/scheduler.py:684` — [cron_runtime_provider] pr = _cfg.get("provider_routing", {})
- `cron/scheduler.py:688` — [cron_runtime_provider] resolve_runtime_provider,
- `cron/scheduler.py:697` — [cron_runtime_provider] runtime = resolve_runtime_provider(**runtime_kwargs)
- `cron/scheduler.py:702` — [cron_runtime_provider] from agent.smart_model_routing import resolve_turn_route
- `cron/scheduler.py:703` — [cron_runtime_provider] turn_route = resolve_turn_route(
- `cron/scheduler.py:717` — [cron_runtime_provider] fallback_model = _cfg.get("fallback_providers") or _cfg.get("fallback_model") or None
- `cron/scheduler.py:746` — [cron_runtime_provider] fallback_model=fallback_model,
- `agent/auxiliary_client.py:1018` — [auxiliary_fallback_chain] def _get_provider_chain() -> List[tuple]:
- `agent/auxiliary_client.py:1107` — [auxiliary_fallback_chain] for label, try_fn in _get_provider_chain():
- `agent/auxiliary_client.py:1189` — [auxiliary_fallback_chain] # ── Step 2: aggregator / fallback chain ──────────────────────────────
- `agent/auxiliary_client.py:1191` — [auxiliary_fallback_chain] for label, try_fn in _get_provider_chain():
- `agent/auxiliary_client.py:2397` — [auxiliary_fallback_chain] # error, fall through to the fallback chain below.
- `agent/auxiliary_client.py:2417` — [auxiliary_fallback_chain] # auto (the default) = best-effort fallback chain. (#7559)
- `agent/auxiliary_client.py:2589` — [auxiliary_fallback_chain] # error, fall through to the fallback chain below.
- `tools/delegate_tool.py:662` — [delegate_runtime_provider] # bundle (base_url, api_key, api_mode) via the same runtime provider system
- `tools/delegate_tool.py:854` — [delegate_runtime_provider] provider) is resolved via the runtime provider system — the same path used
- `tools/delegate_tool.py:909` — [delegate_runtime_provider] from hermes_cli.runtime_provider import resolve_runtime_provider
- `tools/delegate_tool.py:910` — [delegate_runtime_provider] runtime = resolve_runtime_provider(requested=configured_provider)
- `hermes_state.py:115` — [session_db] class SessionDB:
- `batch_runner.py:320` — [trajectory_export] save_trajectories=False, # We handle saving ourselves
- `batch_runner.py:346` — [trajectory_export] trajectory = agent._convert_to_trajectory_format(
- `batch_runner.py:460` — [trajectory_export] trajectory_entry = {
- `batch_runner.py:474` — [trajectory_export] f.write(json.dumps(trajectory_entry, ensure_ascii=False) + "\n")
- `benchmarks/tool_call_benchmark.py:3` — [benchmark_suite] Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
- `benchmarks/tool_call_benchmark.py:9` — [benchmark_suite] python3 benchmarks/tool_call_benchmark.py # full 100-call suite
- `benchmarks/tool_call_benchmark.py:10` — [benchmark_suite] python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test
- `benchmarks/tool_call_benchmark.py:11` — [benchmark_suite] python3 benchmarks/tool_call_benchmark.py --models nous # single model
- `benchmarks/tool_call_benchmark.py:12` — [benchmark_suite] python3 benchmarks/tool_call_benchmark.py --category file # single category
- `benchmarks/tool_call_benchmark.py:37` — [benchmark_suite] class ToolCall:
- `benchmarks/tool_call_benchmark.py:51` — [benchmark_suite] ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
- `benchmarks/tool_call_benchmark.py:53` — [benchmark_suite] ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
- `benchmarks/tool_call_benchmark.py:55` — [benchmark_suite] ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
- `benchmarks/tool_call_benchmark.py:57` — [benchmark_suite] ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
- `benchmarks/tool_call_benchmark.py:59` — [benchmark_suite] ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
- `benchmarks/tool_call_benchmark.py:61` — [benchmark_suite] ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
- `benchmarks/tool_call_benchmark.py:63` — [benchmark_suite] ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
- `benchmarks/tool_call_benchmark.py:65` — [benchmark_suite] ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
- `benchmarks/tool_call_benchmark.py:67` — [benchmark_suite] ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
- `benchmarks/tool_call_benchmark.py:69` — [benchmark_suite] ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
- `benchmarks/tool_call_benchmark.py:71` — [benchmark_suite] ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
- `benchmarks/tool_call_benchmark.py:73` — [benchmark_suite] ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
- `benchmarks/tool_call_benchmark.py:75` — [benchmark_suite] ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
- `benchmarks/tool_call_benchmark.py:77` — [benchmark_suite] ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
- `benchmarks/tool_call_benchmark.py:78` — [benchmark_suite] "patch", "Tool Benchmark"),
- `benchmarks/tool_call_benchmark.py:79` — [benchmark_suite] ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
- `benchmarks/tool_call_benchmark.py:81` — [benchmark_suite] ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
- `benchmarks/tool_call_benchmark.py:83` — [benchmark_suite] ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
- `benchmarks/tool_call_benchmark.py:85` — [benchmark_suite] ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
- `benchmarks/tool_call_benchmark.py:87` — [benchmark_suite] ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
- `benchmarks/tool_call_benchmark.py:89` — [benchmark_suite] ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
- `benchmarks/tool_call_benchmark.py:93` — [benchmark_suite] ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
- `benchmarks/tool_call_benchmark.py:95` — [benchmark_suite] ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
- `benchmarks/tool_call_benchmark.py:97` — [benchmark_suite] ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
- `benchmarks/tool_call_benchmark.py:99` — [benchmark_suite] ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
- `benchmarks/tool_call_benchmark.py:101` — [benchmark_suite] ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
- `benchmarks/tool_call_benchmark.py:103` — [benchmark_suite] ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
- `benchmarks/tool_call_benchmark.py:105` — [benchmark_suite] ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
- `benchmarks/tool_call_benchmark.py:107` — [benchmark_suite] ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
- `benchmarks/tool_call_benchmark.py:109` — [benchmark_suite] ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
- `benchmarks/tool_call_benchmark.py:111` — [benchmark_suite] ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
- `benchmarks/tool_call_benchmark.py:113` — [benchmark_suite] ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
- `benchmarks/tool_call_benchmark.py:115` — [benchmark_suite] ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
- `benchmarks/tool_call_benchmark.py:117` — [benchmark_suite] ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
- `benchmarks/tool_call_benchmark.py:119` — [benchmark_suite] ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
- `benchmarks/tool_call_benchmark.py:121` — [benchmark_suite] ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
- `benchmarks/tool_call_benchmark.py:123` — [benchmark_suite] ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
- `benchmarks/tool_call_benchmark.py:125` — [benchmark_suite] ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
- `benchmarks/tool_call_benchmark.py:127` — [benchmark_suite] ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
- `benchmarks/tool_call_benchmark.py:129` — [benchmark_suite] ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
- `benchmarks/tool_call_benchmark.py:131` — [benchmark_suite] ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
- `benchmarks/tool_call_benchmark.py:135` — [benchmark_suite] ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
- `benchmarks/tool_call_benchmark.py:137` — [benchmark_suite] ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
- `benchmarks/tool_call_benchmark.py:139` — [benchmark_suite] ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
- `benchmarks/tool_call_benchmark.py:141` — [benchmark_suite] ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
- `benchmarks/tool_call_benchmark.py:143` — [benchmark_suite] ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
- `benchmarks/tool_call_benchmark.py:145` — [benchmark_suite] ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
- `benchmarks/tool_call_benchmark.py:147` — [benchmark_suite] ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
- `benchmarks/tool_call_benchmark.py:149` — [benchmark_suite] ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
- `benchmarks/tool_call_benchmark.py:151` — [benchmark_suite] ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
- `benchmarks/tool_call_benchmark.py:153` — [benchmark_suite] ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
- `benchmarks/tool_call_benchmark.py:155` — [benchmark_suite] ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
- `benchmarks/tool_call_benchmark.py:157` — [benchmark_suite] ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
- `benchmarks/tool_call_benchmark.py:159` — [benchmark_suite] ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
- `benchmarks/tool_call_benchmark.py:161` — [benchmark_suite] ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
- `benchmarks/tool_call_benchmark.py:163` — [benchmark_suite] ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
- `benchmarks/tool_call_benchmark.py:167` — [benchmark_suite] ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
- `benchmarks/tool_call_benchmark.py:169` — [benchmark_suite] ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
- `benchmarks/tool_call_benchmark.py:171` — [benchmark_suite] ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
- `benchmarks/tool_call_benchmark.py:173` — [benchmark_suite] ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
- `benchmarks/tool_call_benchmark.py:175` — [benchmark_suite] ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
- `benchmarks/tool_call_benchmark.py:177` — [benchmark_suite] ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
- `benchmarks/tool_call_benchmark.py:179` — [benchmark_suite] ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
- `benchmarks/tool_call_benchmark.py:181` — [benchmark_suite] ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
- `benchmarks/tool_call_benchmark.py:183` — [benchmark_suite] ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
- `benchmarks/tool_call_benchmark.py:185` — [benchmark_suite] ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
- `benchmarks/tool_call_benchmark.py:189` — [benchmark_suite] ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
- `benchmarks/tool_call_benchmark.py:190` — [benchmark_suite] "todo", "benchmark"),
- `benchmarks/tool_call_benchmark.py:191` — [benchmark_suite] ToolCall("todo-02", "todo", "Show me the current todo list.",
- `benchmarks/tool_call_benchmark.py:193` — [benchmark_suite] ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
- `benchmarks/tool_call_benchmark.py:195` — [benchmark_suite] ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
- `benchmarks/tool_call_benchmark.py:197` — [benchmark_suite] ToolCall("todo-05", "todo", "Clear all completed todos.",
- `benchmarks/tool_call_benchmark.py:199` — [benchmark_suite] ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
- `benchmarks/tool_call_benchmark.py:201` — [benchmark_suite] "memory", "benchmark"),
- `benchmarks/tool_call_benchmark.py:202` — [benchmark_suite] ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
- `benchmarks/tool_call_benchmark.py:203` — [benchmark_suite] "memory", "benchmark"),
- `benchmarks/tool_call_benchmark.py:204` — [benchmark_suite] ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
- `benchmarks/tool_call_benchmark.py:206` — [benchmark_suite] ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
- `benchmarks/tool_call_benchmark.py:208` — [benchmark_suite] ToolCall("todo-10", "memory", "Search memory for any notes about models.",
- `benchmarks/tool_call_benchmark.py:212` — [benchmark_suite] ToolCall("skill-01", "skills", "List all available skills.",
- `benchmarks/tool_call_benchmark.py:214` — [benchmark_suite] ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
- `benchmarks/tool_call_benchmark.py:216` — [benchmark_suite] ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
- `benchmarks/tool_call_benchmark.py:218` — [benchmark_suite] ToolCall("skill-04", "skills", "View the 'code-review' skill.",
- `benchmarks/tool_call_benchmark.py:220` — [benchmark_suite] ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
- `benchmarks/tool_call_benchmark.py:222` — [benchmark_suite] ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
- `benchmarks/tool_call_benchmark.py:224` — [benchmark_suite] ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
- `benchmarks/tool_call_benchmark.py:226` — [benchmark_suite] ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
- `benchmarks/tool_call_benchmark.py:228` — [benchmark_suite] ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
- `benchmarks/tool_call_benchmark.py:230` — [benchmark_suite] ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
- `benchmarks/tool_call_benchmark.py:234` — [benchmark_suite] ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
- `benchmarks/tool_call_benchmark.py:236` — [benchmark_suite] ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
- `benchmarks/tool_call_benchmark.py:238` — [benchmark_suite] ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
- `benchmarks/tool_call_benchmark.py:240` — [benchmark_suite] ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
- `benchmarks/tool_call_benchmark.py:242` — [benchmark_suite] ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
- `benchmarks/tool_call_benchmark.py:244` — [benchmark_suite] ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
- `benchmarks/tool_call_benchmark.py:246` — [benchmark_suite] ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
- `benchmarks/tool_call_benchmark.py:248` — [benchmark_suite] ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
- `benchmarks/tool_call_benchmark.py:250` — [benchmark_suite] ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
- `benchmarks/tool_call_benchmark.py:252` — [benchmark_suite] ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
- `benchmarks/tool_call_benchmark.py:254` — [benchmark_suite] ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
- `benchmarks/tool_call_benchmark.py:256` — [benchmark_suite] ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
- `benchmarks/tool_call_benchmark.py:258` — [benchmark_suite] ToolCall("skill-13", "skills", "List all available skill categories.",
- `benchmarks/tool_call_benchmark.py:260` — [benchmark_suite] ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
- `benchmarks/tool_call_benchmark.py:262` — [benchmark_suite] ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
- `benchmarks/tool_call_benchmark.py:311` — [benchmark_suite] """Create prerequisite files for the benchmark."""
- `benchmarks/tool_call_benchmark.py:313` — [benchmark_suite] "This is a benchmark test file.\n"
- `benchmarks/tool_call_benchmark.py:349` — [benchmark_suite] "You are a benchmark test runner. Execute the user's request by calling "
- `benchmarks/tool_call_benchmark.py:406` — [benchmark_suite] """Generate markdown benchmark report."""
- `benchmarks/tool_call_benchmark.py:428` — [benchmark_suite] f"# Tool-Calling Benchmark Report",
- `benchmarks/tool_call_benchmark.py:535` — [benchmark_suite] parser = argparse.ArgumentParser(description="Tool-calling benchmark")
- `benchmarks/tool_call_benchmark.py:544` — [benchmark_suite] help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
- `benchmarks/tool_call_benchmark.py:565` — [benchmark_suite] output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"
- `benchmarks/tool_call_benchmark.py:575` — [benchmark_suite] print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
## Suggested next slice
1. Build an exporter that emits SessionDB + trajectory data into a TensorZero-friendly offline dataset.
2. Define percentage-based canary controls before attempting any gateway replacement.
3. Keep Hermes routing authoritative until TensorZero proves parity across CLI, gateway, cron, auxiliary, and delegation surfaces.

View File

@@ -0,0 +1,67 @@
{
"generated_at": "2026-04-22T16:21:56.271426+00:00",
"config": {
"total_images": 2,
"runs_per_model": 1,
"models": {
"gemma4": "Gemma 4 27B",
"gemini3_flash": "Gemini 3 Flash Preview"
}
},
"results": [
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_mark",
"category": "screenshot"
},
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_social",
"category": "screenshot"
}
],
"summary": {
"gemma4": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500"
]
},
"gemini3_flash": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found"
]
}
}
}

View File

@@ -0,0 +1,44 @@
# Vision Benchmark Report
Generated: 2026-04-22T16:21
Images tested: 2
Runs per model: 1
Models: Gemma 4 27B, Gemini 3 Flash Preview
## Latency Comparison
| Model | Mean (ms) | Median | P95 | Std Dev |
|-------|-----------|--------|-----|---------|
## Accuracy Comparison
| Model | OCR Accuracy | Keyword Coverage | Success Rate |
|-------|-------------|-----------------|--------------|
## Token Usage
| Model | Mean Tokens/Image | Total Tokens |
|-------|------------------|--------------|
## Failure Modes
### Gemma 4 27B
- Summary: All runs failed
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500
### Gemini 3 Flash Preview
- Summary: All runs failed
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found
## Verdict
Benchmark blocked or insufficient data for a trustworthy winner.
Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.

View File

@@ -1,318 +0,0 @@
#!/usr/bin/env python3
"""Generate a grounded TensorZero evaluation packet for Hermes.
This script inventories the current Hermes routing/evaluation surfaces, then
builds a markdown packet assessing how much of issue #860 can be satisfied by
TensorZero and where the migration risk still lives.
"""
from __future__ import annotations
import argparse
import json
import re
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable
ISSUE_NUMBER = 860
ISSUE_TITLE = "tensorzero LLMOps platform evaluation"
ISSUE_URL = "https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/860"
DEFAULT_OUTPUT = Path("docs/evaluations/tensorzero-860-evaluation.md")
DEFAULT_JSON_OUTPUT = Path("docs/evaluations/tensorzero-860-evaluation.json")
@dataclass(frozen=True)
class TouchpointPattern:
label: str
file_path: str
regex: str
description: str
@dataclass(frozen=True)
class Touchpoint:
label: str
file_path: str
line_number: int
matched_text: str
@dataclass(frozen=True)
class RequirementStatus:
key: str
name: str
status: str
evidence_labels: tuple[str, ...]
summary: str
@dataclass(frozen=True)
class EvaluationReport:
issue_number: int
issue_title: str
issue_url: str
recommendation: str
touchpoints: tuple[Touchpoint, ...]
requirements: tuple[RequirementStatus, ...]
PATTERNS: tuple[TouchpointPattern, ...] = (
TouchpointPattern(
label="fallback_chain",
file_path="run_agent.py",
regex=r"_fallback_chain|fallback_providers|fallback_model",
description="Primary agent fallback-provider chain in the core conversation loop.",
),
TouchpointPattern(
label="provider_routing_config",
file_path="cli.py",
regex=r"provider_routing|fallback_providers|smart_model_routing",
description="CLI-owned provider routing and fallback configuration surfaces.",
),
TouchpointPattern(
label="runtime_provider",
file_path="hermes_cli/runtime_provider.py",
regex=r"def resolve_runtime_provider|def resolve_requested_provider",
description="Central runtime provider resolution for CLI, gateway, cron, and helpers.",
),
TouchpointPattern(
label="smart_model_routing",
file_path="agent/smart_model_routing.py",
regex=r"def resolve_turn_route|def choose_cheap_model_route",
description="Cheap-vs-strong turn routing that TensorZero would need to absorb or replace.",
),
TouchpointPattern(
label="gateway_provider_routing",
file_path="gateway/run.py",
regex=r"def _load_provider_routing|def _load_fallback_model|def _load_smart_model_routing",
description="Gateway-specific loading of routing, fallback, and smart-model policies.",
),
TouchpointPattern(
label="cron_runtime_provider",
file_path="cron/scheduler.py",
regex=r"resolve_runtime_provider|resolve_turn_route|provider_routing|fallback_model",
description="Cron execution path that re-resolves providers and routing on every run.",
),
TouchpointPattern(
label="auxiliary_fallback_chain",
file_path="agent/auxiliary_client.py",
regex=r"fallback chain|_get_provider_chain|provider chain",
description="Auxiliary task routing/fallback chain outside the main inference path.",
),
TouchpointPattern(
label="delegate_runtime_provider",
file_path="tools/delegate_tool.py",
regex=r"runtime provider system|resolve the full credential bundle|resolve_runtime_provider",
description="Subagent/delegation routing path that would also need TensorZero parity.",
),
TouchpointPattern(
label="session_db",
file_path="hermes_state.py",
regex=r"class SessionDB",
description="Session persistence surface that could feed TensorZero optimization/eval data.",
),
TouchpointPattern(
label="trajectory_export",
file_path="batch_runner.py",
regex=r"trajectory_entry|save_trajectories|_convert_to_trajectory_format",
description="Trajectory export surface for offline optimization and replay data.",
),
TouchpointPattern(
label="benchmark_suite",
file_path="benchmarks/tool_call_benchmark.py",
regex=r"ToolCall\(|class ToolCall|benchmark",
description="Existing benchmark/evaluation harness that could map to TensorZero experiments.",
),
)
def _iter_matches(pattern: TouchpointPattern, text: str) -> Iterable[Touchpoint]:
regex = re.compile(pattern.regex, re.IGNORECASE)
for line_number, line in enumerate(text.splitlines(), start=1):
if regex.search(line):
yield Touchpoint(
label=pattern.label,
file_path=pattern.file_path,
line_number=line_number,
matched_text=line.strip(),
)
def scan_touchpoints(repo_root: Path) -> list[Touchpoint]:
touchpoints: list[Touchpoint] = []
for pattern in PATTERNS:
path = repo_root / pattern.file_path
if not path.exists():
continue
text = path.read_text(encoding="utf-8")
touchpoints.extend(_iter_matches(pattern, text))
return touchpoints
def build_requirement_matrix(touchpoints: list[Touchpoint]) -> list[RequirementStatus]:
labels = {tp.label for tp in touchpoints}
matrix: list[RequirementStatus] = []
gateway_labels = (
"fallback_chain",
"runtime_provider",
"gateway_provider_routing",
"cron_runtime_provider",
"auxiliary_fallback_chain",
"delegate_runtime_provider",
)
gateway_hits = tuple(label for label in gateway_labels if label in labels)
gateway_status = "partial" if len(gateway_hits) >= 4 else "gap"
gateway_summary = (
"Hermes already spreads provider routing across core agent, runtime provider, gateway, cron, auxiliary, and delegation seams; "
"TensorZero would need parity across all of them before it can replace the gateway layer."
if gateway_hits else
"No grounded routing surfaces were found for a gateway replacement assessment."
)
matrix.append(RequirementStatus("gateway_replacement", "Gateway replacement scope", gateway_status, gateway_hits, gateway_summary))
config_labels = (
"provider_routing_config",
"runtime_provider",
"smart_model_routing",
"fallback_chain",
)
config_hits = tuple(label for label in config_labels if label in labels)
config_status = "partial" if len(config_hits) >= 3 else "gap"
config_summary = (
"Hermes has multiple config concepts to migrate (`provider_routing`, `fallback_providers`, `smart_model_routing`, runtime provider resolution), "
"so TensorZero is not a drop-in config swap."
if config_hits else
"No current config migration surface was found."
)
matrix.append(RequirementStatus("config_migration", "Config migration", config_status, config_hits, config_summary))
canary_hits: tuple[str, ...] = tuple()
canary_summary = (
"The repo shows semantic routing and fallback, but no grounded 10% traffic-split canary mechanism. "
"A TensorZero cutover would need new percentage-based rollout controls and observability hooks."
)
matrix.append(RequirementStatus("canary_rollout", "10% traffic canary", "gap", canary_hits, canary_summary))
session_labels = ("session_db", "trajectory_export")
session_hits = tuple(label for label in session_labels if label in labels)
session_status = "partial" if len(session_hits) == len(session_labels) else "gap"
session_summary = (
"Hermes already has SessionDB and trajectory export surfaces that can feed offline optimization data, "
"but not a TensorZero-native ingestion path yet."
if session_hits else
"No session-data surface was found for prompt optimization."
)
matrix.append(RequirementStatus("session_feedback", "Session data for prompt optimization", session_status, session_hits, session_summary))
eval_labels = ("benchmark_suite", "trajectory_export")
eval_hits = tuple(label for label in eval_labels if label in labels)
eval_status = "partial" if "benchmark_suite" in eval_hits else "gap"
eval_summary = (
"Hermes already has benchmark/trajectory machinery that can seed TensorZero A/B evaluation, "
"but no integrated TensorZero experiment runner or live evaluation gateway."
if eval_hits else
"No evaluation harness was found to support TensorZero A/B testing."
)
matrix.append(RequirementStatus("evaluation_suite", "Evaluation suite / A/B testing", eval_status, eval_hits, eval_summary))
return matrix
def build_report(touchpoints: list[Touchpoint], requirement_matrix: list[RequirementStatus]) -> EvaluationReport:
recommendation = (
"Not ready for direct replacement. Recommend a shadow-evaluation phase first: keep Hermes routing live, "
"inventory the migration seams, export SessionDB/trajectory data into an offline TensorZero experiment loop, "
"and only design a canary gateway once percentage-based rollout controls exist."
)
return EvaluationReport(
issue_number=ISSUE_NUMBER,
issue_title=ISSUE_TITLE,
issue_url=ISSUE_URL,
recommendation=recommendation,
touchpoints=tuple(touchpoints),
requirements=tuple(requirement_matrix),
)
def build_markdown(report: EvaluationReport) -> str:
lines: list[str] = []
lines.append("# TensorZero Evaluation Packet")
lines.append("")
lines.append(f"Issue #{report.issue_number}: [{report.issue_title}]({report.issue_url})")
lines.append("")
lines.append("## Scope")
lines.append("")
lines.append("This packet evaluates TensorZero as a possible replacement for Hermes' custom provider-routing stack.")
lines.append("It is intentionally grounded in the current repo state rather than a speculative cutover plan.")
lines.append("")
lines.append("## Issue requirements being evaluated")
lines.append("")
lines.append("- Deploy tensorzero gateway (Rust binary)")
lines.append("- Migrate provider routing config")
lines.append("- Test with canary (10% traffic) before full cutover")
lines.append("- Feed session data for prompt optimization")
lines.append("- Evaluation suite for A/B testing models")
lines.append("")
lines.append("## Recommendation")
lines.append("")
lines.append(report.recommendation)
lines.append("")
lines.append("## Requirement matrix")
lines.append("")
lines.append("| Requirement | Status | Evidence labels | Summary |")
lines.append("| --- | --- | --- | --- |")
for row in report.requirements:
evidence = ", ".join(row.evidence_labels) if row.evidence_labels else ""
lines.append(f"| {row.name} | {row.status} | {evidence} | {row.summary} |")
lines.append("")
lines.append("## Grounded Hermes touchpoints")
lines.append("")
if report.touchpoints:
for tp in report.touchpoints:
lines.append(f"- `{tp.file_path}:{tp.line_number}` — [{tp.label}] {tp.matched_text}")
else:
lines.append("- No routing/evaluation touchpoints were found.")
lines.append("")
lines.append("## Suggested next slice")
lines.append("")
lines.append("1. Build an exporter that emits SessionDB + trajectory data into a TensorZero-friendly offline dataset.")
lines.append("2. Define percentage-based canary controls before attempting any gateway replacement.")
lines.append("3. Keep Hermes routing authoritative until TensorZero proves parity across CLI, gateway, cron, auxiliary, and delegation surfaces.")
lines.append("")
return "\n".join(lines).rstrip() + "\n"
def write_outputs(report: EvaluationReport, markdown_path: Path, json_path: Path | None = None) -> None:
markdown_path.parent.mkdir(parents=True, exist_ok=True)
markdown_path.write_text(build_markdown(report), encoding="utf-8")
if json_path is not None:
json_path.parent.mkdir(parents=True, exist_ok=True)
json_path.write_text(json.dumps(asdict(report), indent=2), encoding="utf-8")
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Generate a grounded TensorZero evaluation packet for Hermes")
parser.add_argument("--repo-root", default=".", help="Hermes repo root to scan")
parser.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Markdown output path")
parser.add_argument("--json-output", default=str(DEFAULT_JSON_OUTPUT), help="Optional JSON output path")
return parser.parse_args()
def main() -> int:
args = parse_args()
repo_root = Path(args.repo_root).resolve()
touchpoints = scan_touchpoints(repo_root)
matrix = build_requirement_matrix(touchpoints)
report = build_report(touchpoints, matrix)
json_output = Path(args.json_output) if args.json_output else None
write_outputs(report, Path(args.output), json_output)
print(f"Wrote {args.output}")
if json_output is not None:
print(f"Wrote {json_output}")
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -1,149 +0,0 @@
from pathlib import Path
import sys
SCRIPT_DIR = Path(__file__).resolve().parents[1] / "scripts"
sys.path.insert(0, str(SCRIPT_DIR))
import tensorzero_eval_packet as tz
def test_scan_touchpoints_finds_expected_matches(tmp_path):
(tmp_path / "run_agent.py").write_text(
"self._fallback_chain = []\n# Provider fallback chain\n"
)
(tmp_path / "hermes_cli").mkdir()
(tmp_path / "hermes_cli" / "runtime_provider.py").write_text(
"def resolve_runtime_provider():\n return {}\n"
)
(tmp_path / "agent").mkdir()
(tmp_path / "agent" / "smart_model_routing.py").write_text(
"def resolve_turn_route(user_message, routing_config, primary):\n return primary\n"
)
(tmp_path / "gateway").mkdir()
(tmp_path / "gateway" / "run.py").write_text(
"def _load_provider_routing():\n return {}\n"
)
(tmp_path / "cron").mkdir()
(tmp_path / "cron" / "scheduler.py").write_text(
"runtime = resolve_runtime_provider()\nturn_route = resolve_turn_route('x', {}, {})\n"
)
(tmp_path / "hermes_state.py").write_text("class SessionDB:\n pass\n")
(tmp_path / "benchmarks").mkdir()
(tmp_path / "benchmarks" / "tool_call_benchmark.py").write_text(
"class ToolCall: ...\n"
)
touchpoints = tz.scan_touchpoints(tmp_path)
labels = {tp.label for tp in touchpoints}
assert "fallback_chain" in labels
assert "runtime_provider" in labels
assert "smart_model_routing" in labels
assert "gateway_provider_routing" in labels
assert "cron_runtime_provider" in labels
assert "session_db" in labels
assert "benchmark_suite" in labels
def test_build_requirement_matrix_marks_canary_as_gap_without_split_support():
touchpoints = [
tz.Touchpoint(
label="runtime_provider",
file_path="hermes_cli/runtime_provider.py",
line_number=10,
matched_text="def resolve_runtime_provider",
),
tz.Touchpoint(
label="provider_routing_config",
file_path="cli.py",
line_number=20,
matched_text='provider_routing',
),
tz.Touchpoint(
label="fallback_chain",
file_path="run_agent.py",
line_number=21,
matched_text='_fallback_chain = []',
),
tz.Touchpoint(
label="smart_model_routing",
file_path="agent/smart_model_routing.py",
line_number=30,
matched_text='resolve_turn_route',
),
tz.Touchpoint(
label="gateway_provider_routing",
file_path="gateway/run.py",
line_number=35,
matched_text='def _load_provider_routing',
),
tz.Touchpoint(
label="cron_runtime_provider",
file_path="cron/scheduler.py",
line_number=36,
matched_text='runtime = resolve_runtime_provider()',
),
tz.Touchpoint(
label="session_db",
file_path="hermes_state.py",
line_number=40,
matched_text='class SessionDB',
),
tz.Touchpoint(
label="trajectory_export",
file_path="batch_runner.py",
line_number=50,
matched_text='trajectory_entry',
),
tz.Touchpoint(
label="benchmark_suite",
file_path="benchmarks/tool_call_benchmark.py",
line_number=60,
matched_text='ToolCall',
),
]
matrix = tz.build_requirement_matrix(touchpoints)
by_key = {row.key: row for row in matrix}
assert by_key["gateway_replacement"].status == "partial"
assert by_key["config_migration"].status == "partial"
assert by_key["canary_rollout"].status == "gap"
assert by_key["session_feedback"].status == "partial"
assert by_key["evaluation_suite"].status == "partial"
def test_build_markdown_renders_recommendation_and_touchpoints():
touchpoints = [
tz.Touchpoint(
label="runtime_provider",
file_path="hermes_cli/runtime_provider.py",
line_number=10,
matched_text="def resolve_runtime_provider",
),
tz.Touchpoint(
label="session_db",
file_path="hermes_state.py",
line_number=40,
matched_text='class SessionDB',
),
]
matrix = tz.build_requirement_matrix(touchpoints)
report = tz.build_report(touchpoints, matrix)
markdown = tz.build_markdown(report)
assert "# TensorZero Evaluation Packet" in markdown
assert "gateway_replacement" not in markdown # human labels, not raw keys
assert "Gateway replacement scope" in markdown
assert "Not ready for direct replacement" in markdown
assert "hermes_cli/runtime_provider.py:10" in markdown
assert "hermes_state.py:40" in markdown
def test_issue_context_is_embedded_in_report():
report = tz.build_report([], [])
markdown = tz.build_markdown(report)
assert "Issue #860" in markdown
assert "tensorzero" in markdown.lower()
assert "10% traffic" in markdown

View File

@@ -199,7 +199,7 @@ class TestMarkdown:
class TestDataset:
def test_sample_dataset_has_entries(self):
dataset = generate_sample_dataset()
assert len(dataset) >= 4
assert len(dataset) >= 50
def test_sample_dataset_structure(self):
dataset = generate_sample_dataset()
@@ -216,6 +216,9 @@ class TestDataset:
assert "screenshot" in categories
assert "diagram" in categories
assert "photo" in categories
assert "chart" in categories
assert "ocr" in categories
assert "document" in categories
class TestModels:

View File

@@ -0,0 +1,21 @@
import json
from pathlib import Path
DATASET = Path("benchmarks/test_images.json")
REPORT = Path("metrics/vision-benchmark-smoke-2026-04-22.md")
def test_benchmark_dataset_is_issue_sized_and_category_complete() -> None:
items = json.loads(DATASET.read_text(encoding="utf-8"))
assert len(items) >= 50
categories = {item["category"] for item in items}
assert {"screenshot", "diagram", "photo", "ocr", "chart", "document"}.issubset(categories)
def test_metrics_report_exists_with_recommendation() -> None:
assert REPORT.exists(), "missing benchmark report under metrics/"
text = REPORT.read_text(encoding="utf-8")
assert "Recommendation" in text
assert "Gemma 4" in text
assert "Gemini" in text