Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a814f5bef |
@@ -1,194 +1,354 @@
|
||||
[
|
||||
{
|
||||
"id": "screenshot_github_home",
|
||||
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
|
||||
"url": "test_images/screenshot_github_home.png",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["github", "logo", "mark"],
|
||||
"expected_keywords": [
|
||||
"github",
|
||||
"logo",
|
||||
"mark"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "diagram_mermaid_flow",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
|
||||
"url": "test_images/diagram_mermaid_flow.png",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["flow", "diagram", "process"],
|
||||
"expected_keywords": [
|
||||
"flow",
|
||||
"diagram",
|
||||
"process"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_1",
|
||||
"url": "https://picsum.photos/seed/vision1/400/300",
|
||||
"url": "test_images/photo_random_1.png",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_2",
|
||||
"url": "https://picsum.photos/seed/vision2/400/300",
|
||||
"url": "test_images/photo_random_2.png",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "chart_simple_bar",
|
||||
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
|
||||
"url": "test_images/chart_simple_bar.png",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["bar", "chart", "revenue"],
|
||||
"expected_keywords": [
|
||||
"bar",
|
||||
"chart",
|
||||
"revenue"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "chart_pie",
|
||||
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
|
||||
"url": "test_images/chart_pie.png",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["pie", "chart", "percentage"],
|
||||
"expected_keywords": [
|
||||
"pie",
|
||||
"chart",
|
||||
"percentage"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "diagram_org_chart",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
|
||||
"url": "test_images/diagram_org_chart.png",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["organization", "hierarchy", "chart"],
|
||||
"expected_keywords": [
|
||||
"organization",
|
||||
"hierarchy",
|
||||
"chart"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "screenshot_terminal",
|
||||
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
|
||||
"url": "test_images/screenshot_terminal.png",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["terminal", "command", "output"],
|
||||
"expected_keywords": [
|
||||
"terminal",
|
||||
"command",
|
||||
"output"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_3",
|
||||
"url": "https://picsum.photos/seed/vision3/400/300",
|
||||
"url": "test_images/photo_random_3.png",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "chart_line",
|
||||
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
|
||||
"url": "test_images/chart_line.png",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["line", "chart", "temperature"],
|
||||
"expected_keywords": [
|
||||
"line",
|
||||
"chart",
|
||||
"temperature"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "diagram_sequence",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
|
||||
"url": "test_images/diagram_sequence.png",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["sequence", "interaction", "message"],
|
||||
"expected_keywords": [
|
||||
"sequence",
|
||||
"interaction",
|
||||
"message"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_4",
|
||||
"url": "https://picsum.photos/seed/vision4/400/300",
|
||||
"url": "test_images/photo_random_4.png",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "screenshot_webpage",
|
||||
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
|
||||
"url": "test_images/screenshot_webpage.png",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["github", "page", "web"],
|
||||
"expected_keywords": [
|
||||
"github",
|
||||
"page",
|
||||
"web"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "chart_radar",
|
||||
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
|
||||
"url": "test_images/chart_radar.png",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["radar", "chart", "skill"],
|
||||
"expected_keywords": [
|
||||
"radar",
|
||||
"chart",
|
||||
"skill"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_5",
|
||||
"url": "https://picsum.photos/seed/vision5/400/300",
|
||||
"url": "test_images/photo_random_5.png",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "diagram_class",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
|
||||
"url": "test_images/diagram_class.png",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["class", "object", "attribute"],
|
||||
"expected_keywords": [
|
||||
"class",
|
||||
"object",
|
||||
"attribute"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "chart_doughnut",
|
||||
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
|
||||
"url": "test_images/chart_doughnut.png",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["doughnut", "chart", "device"],
|
||||
"expected_keywords": [
|
||||
"doughnut",
|
||||
"chart",
|
||||
"device"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_6",
|
||||
"url": "https://picsum.photos/seed/vision6/400/300",
|
||||
"url": "test_images/photo_random_6.png",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "screenshot_error",
|
||||
"url": "https://http.cat/404.jpg",
|
||||
"url": "test_images/screenshot_error.png",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["404", "error", "cat"],
|
||||
"expected_keywords": [
|
||||
"404",
|
||||
"error",
|
||||
"cat"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "diagram_network",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
|
||||
"url": "test_images/diagram_network.png",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["network", "node", "connection"],
|
||||
"expected_keywords": [
|
||||
"network",
|
||||
"node",
|
||||
"connection"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_7",
|
||||
"url": "https://picsum.photos/seed/vision7/400/300",
|
||||
"url": "test_images/photo_random_7.png",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "chart_stacked_bar",
|
||||
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
|
||||
"url": "test_images/chart_stacked_bar.png",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["stacked", "bar", "chart"],
|
||||
"expected_keywords": [
|
||||
"stacked",
|
||||
"bar",
|
||||
"chart"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
"expected_structure": {
|
||||
"min_length": 50,
|
||||
"min_sentences": 2,
|
||||
"has_numbers": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "screenshot_dashboard",
|
||||
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
|
||||
"url": "test_images/screenshot_dashboard.png",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["search", "code", "feature"],
|
||||
"expected_keywords": [
|
||||
"search",
|
||||
"code",
|
||||
"feature"
|
||||
],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_8",
|
||||
"url": "https://picsum.photos/seed/vision8/400/300",
|
||||
"url": "test_images/photo_random_8.png",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
"expected_structure": {
|
||||
"min_length": 30,
|
||||
"min_sentences": 1,
|
||||
"has_numbers": false
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
BIN
benchmarks/test_images/chart_doughnut.png
Normal file
|
After Width: | Height: | Size: 4.4 KiB |
BIN
benchmarks/test_images/chart_line.png
Normal file
|
After Width: | Height: | Size: 4.1 KiB |
BIN
benchmarks/test_images/chart_pie.png
Normal file
|
After Width: | Height: | Size: 4.0 KiB |
BIN
benchmarks/test_images/chart_radar.png
Normal file
|
After Width: | Height: | Size: 3.5 KiB |
BIN
benchmarks/test_images/chart_simple_bar.png
Normal file
|
After Width: | Height: | Size: 4.2 KiB |
BIN
benchmarks/test_images/chart_stacked_bar.png
Normal file
|
After Width: | Height: | Size: 5.0 KiB |
BIN
benchmarks/test_images/diagram_class.png
Normal file
|
After Width: | Height: | Size: 4.6 KiB |
BIN
benchmarks/test_images/diagram_mermaid_flow.png
Normal file
|
After Width: | Height: | Size: 4.8 KiB |
BIN
benchmarks/test_images/diagram_network.png
Normal file
|
After Width: | Height: | Size: 5.0 KiB |
BIN
benchmarks/test_images/diagram_org_chart.png
Normal file
|
After Width: | Height: | Size: 5.1 KiB |
BIN
benchmarks/test_images/diagram_sequence.png
Normal file
|
After Width: | Height: | Size: 5.2 KiB |
BIN
benchmarks/test_images/photo_random_1.png
Normal file
|
After Width: | Height: | Size: 3.0 KiB |
BIN
benchmarks/test_images/photo_random_2.png
Normal file
|
After Width: | Height: | Size: 3.0 KiB |
BIN
benchmarks/test_images/photo_random_3.png
Normal file
|
After Width: | Height: | Size: 3.0 KiB |
BIN
benchmarks/test_images/photo_random_4.png
Normal file
|
After Width: | Height: | Size: 2.9 KiB |
BIN
benchmarks/test_images/photo_random_5.png
Normal file
|
After Width: | Height: | Size: 3.0 KiB |
BIN
benchmarks/test_images/photo_random_6.png
Normal file
|
After Width: | Height: | Size: 3.0 KiB |
BIN
benchmarks/test_images/photo_random_7.png
Normal file
|
After Width: | Height: | Size: 3.0 KiB |
BIN
benchmarks/test_images/photo_random_8.png
Normal file
|
After Width: | Height: | Size: 3.0 KiB |
BIN
benchmarks/test_images/screenshot_dashboard.png
Normal file
|
After Width: | Height: | Size: 7.1 KiB |
BIN
benchmarks/test_images/screenshot_error.png
Normal file
|
After Width: | Height: | Size: 6.2 KiB |
BIN
benchmarks/test_images/screenshot_github_home.png
Normal file
|
After Width: | Height: | Size: 7.1 KiB |
BIN
benchmarks/test_images/screenshot_terminal.png
Normal file
|
After Width: | Height: | Size: 7.1 KiB |
BIN
benchmarks/test_images/screenshot_webpage.png
Normal file
|
After Width: | Height: | Size: 7.2 KiB |
@@ -11,17 +11,19 @@ Usage:
|
||||
|
||||
# Single image test
|
||||
python benchmarks/vision_benchmark.py --url https://example.com/image.png
|
||||
python benchmarks/vision_benchmark.py --url benchmarks/test_images/photo_random_1.png
|
||||
|
||||
# Generate test report
|
||||
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json
|
||||
|
||||
Test image dataset: benchmarks/test_images.json (50-100 diverse images)
|
||||
Test image dataset: benchmarks/test_images.json (committed local fixtures under benchmarks/test_images/)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import mimetypes
|
||||
import os
|
||||
import statistics
|
||||
import sys
|
||||
@@ -67,6 +69,28 @@ EVAL_PROMPTS = {
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _is_remote_image_source(image_source: str) -> bool:
|
||||
return image_source.startswith(("http://", "https://", "data:", "file://"))
|
||||
|
||||
|
||||
def _image_source_to_payload_url(image_source: str) -> str:
|
||||
"""Convert local image paths into data URLs; keep remote URLs unchanged."""
|
||||
if image_source.startswith(("http://", "https://", "data:")):
|
||||
return image_source
|
||||
|
||||
resolved = image_source[len("file://"):] if image_source.startswith("file://") else image_source
|
||||
local_path = Path(os.path.expanduser(resolved)).resolve()
|
||||
if not local_path.is_file():
|
||||
return image_source
|
||||
|
||||
mime_type, _ = mimetypes.guess_type(str(local_path))
|
||||
if not mime_type:
|
||||
mime_type = "application/octet-stream"
|
||||
|
||||
encoded = base64.b64encode(local_path.read_bytes()).decode("ascii")
|
||||
return f"data:{mime_type};base64,{encoded}"
|
||||
|
||||
|
||||
async def analyze_with_model(
|
||||
image_url: str,
|
||||
prompt: str,
|
||||
@@ -84,6 +108,8 @@ async def analyze_with_model(
|
||||
"""
|
||||
import httpx
|
||||
|
||||
image_payload_url = _image_source_to_payload_url(image_url)
|
||||
|
||||
provider = model_config["provider"]
|
||||
model_id = model_config["model_id"]
|
||||
|
||||
@@ -93,7 +119,7 @@ async def analyze_with_model(
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
{"type": "image_url", "image_url": {"url": image_payload_url}},
|
||||
],
|
||||
}
|
||||
]
|
||||
@@ -570,8 +596,18 @@ def generate_sample_dataset() -> List[dict]:
|
||||
|
||||
def load_dataset(path: str) -> List[dict]:
    """Load the test dataset from a JSON file.

    Entries whose ``url`` is a plain path (not recognised by
    ``_is_remote_image_source``) are rewritten in place to an absolute path
    resolved against the directory containing the dataset file, so the
    dataset behaves the same regardless of the current working directory.
    Entries with a missing or empty ``url`` are left untouched.

    Args:
        path: Location of the dataset JSON file.

    Returns:
        The parsed list of image entries, with local ``url`` values made
        absolute.
    """
    dataset_path = Path(path).resolve()
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # locale encoding when reading it.
    with open(dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)

    base_dir = dataset_path.parent
    for image in dataset:
        image_url = image.get("url")
        if not image_url or _is_remote_image_source(image_url):
            continue
        image["url"] = str((base_dir / image_url).resolve())

    return dataset
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -582,7 +618,7 @@ def load_dataset(path: str) -> List[dict]:
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
|
||||
parser.add_argument("--images", help="Path to test images JSON file")
|
||||
parser.add_argument("--url", help="Single image URL to test")
|
||||
parser.add_argument("--url", help="Single image URL or local file path to test")
|
||||
parser.add_argument("--category", default="photo", help="Category for single URL")
|
||||
parser.add_argument("--output", default=None, help="Output JSON file")
|
||||
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
|
||||
|
||||
@@ -1,387 +0,0 @@
|
||||
# Morning Review Packet
|
||||
|
||||
Source epic: [EPIC: Morning review packet — Hermes harness features landed 2026-04-21](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/949)
|
||||
|
||||
## Epic context
|
||||
|
||||
EPIC: Morning review packet — Hermes harness features landed 2026-04-21
|
||||
|
||||
Source: git log on upstream/main since 2026-04-21 00:00 EDT, plus the current local branch `burn/921-poka-yoke-hardcoded-paths` for the branch-only path-guard work.
|
||||
|
||||
Important review note:
|
||||
- Validate upstream-landed features on `upstream/main` or a synced branch.
|
||||
- Validate the path-guard work on `burn/921-poka-yoke-hardcoded-paths`.
|
||||
|
||||
This epic is a morning-review packet: one QA issue per feature cluster, each with concrete acceptance criteria and targeted tests or manual checks.
|
||||
|
||||
## Success criteria
|
||||
- [ ] Every issue has a clear PASS / FAIL outcome.
|
||||
- [ ] Test output or manual evidence is attached to each issue.
|
||||
- [ ] Any drift between upstream/main and forge/main is called out explicitly.
|
||||
|
||||
## Sub-issues
|
||||
### Upstream/main features landed 2026-04-21
|
||||
- [ ] #950 [QA] Verify AI Gateway provider UX + attribution headers
|
||||
- [ ] #951 [QA] Verify transport abstraction + AnthropicTransport wiring
|
||||
- [ ] #952 [QA] Verify CLI voice beep toggle
|
||||
- [ ] #953 [QA] Verify bundled skill scripts run out of the box
|
||||
- [ ] #954 [QA] Verify maps skill guest_house / camp_site / bakery expansion
|
||||
- [ ] #955 [QA] Verify KittenTTS local provider end-to-end
|
||||
- [ ] #956 [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
|
||||
- [ ] #957 [QA] Verify optional adversarial-ux-test skill catalog flow
|
||||
- [ ] #958 [QA] Verify /usage account limits in CLI + gateway
|
||||
- [ ] #959 [QA] Verify OpenCode-Go curated catalog additions
|
||||
- [ ] #960 [QA] Verify patch 'did you mean?' suggestions
|
||||
- [ ] #961 [QA] Verify web dashboard update/restart action buttons
|
||||
|
||||
### Local branch-only work
|
||||
- [ ] #962 [QA] Verify hardcoded-home path guard on burn/921 branch
|
||||
|
||||
## Summary
|
||||
|
||||
| Issue | State | Commits | Tests |
|
||||
| --- | --- | --- | --- |
|
||||
| #950 | open | 5 | 2 |
|
||||
| #951 | open | 2 | 2 |
|
||||
| #952 | open | 1 | 1 |
|
||||
| #953 | open | 1 | 2 |
|
||||
| #954 | open | 1 | 0 |
|
||||
| #955 | open | 2 | 1 |
|
||||
| #956 | open | 1 | 0 |
|
||||
| #957 | open | 1 | 0 |
|
||||
| #958 | open | 2 | 2 |
|
||||
| #959 | open | 1 | 1 |
|
||||
| #960 | open | 2 | 1 |
|
||||
| #961 | closed | 1 | 0 |
|
||||
| #962 | closed | 1 | 1 |
|
||||
|
||||
## #950 — [QA] Verify AI Gateway provider UX + attribution headers
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/950
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `b11753879` — attribution default_headers for ai-gateway provider
|
||||
- `700437440` — curated picker with live pricing
|
||||
- `ac26a460f` — promote ai-gateway in provider picker ordering
|
||||
- `5bb2d11b0` — auto-promote free Moonshot models
|
||||
- `29f57ec95` — Vercel deep-link for API key creation
|
||||
|
||||
### Targeted tests
|
||||
- `tests/hermes_cli/test_ai_gateway_models.py`
|
||||
- `tests/run_agent/test_provider_attribution_headers.py`
|
||||
|
||||
### Tasks
|
||||
- [ ] Open `hermes model` and verify `ai-gateway` appears near the top.
|
||||
- [ ] Verify live pricing appears in the picker.
|
||||
- [ ] Verify free Moonshot models are promoted.
|
||||
- [ ] Trigger API-key setup flow and verify the Vercel deep link.
|
||||
- [ ] Send one ai-gateway request and verify attribution headers are attached.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] UI ordering and pricing match the landed behavior.
|
||||
- [ ] Attribution headers are present on ai-gateway requests.
|
||||
- [ ] Targeted tests pass.
|
||||
|
||||
## #951 — [QA] Verify transport abstraction + AnthropicTransport wiring
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/951
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `7ab5eebd0` — transport types + Anthropic normalize migration
|
||||
- `731f4fbae` — transport ABC + AnthropicTransport wired to all paths
|
||||
|
||||
### Targeted tests
|
||||
- `tests/agent/transports/test_types.py`
|
||||
- `tests/agent/test_anthropic_normalize_v2.py`
|
||||
|
||||
### Tasks
|
||||
- [ ] Verify plain-text Anthropic responses normalize correctly.
|
||||
- [ ] Verify tool-call responses preserve IDs, names, and arguments.
|
||||
- [ ] Verify reasoning/thinking is preserved separately from visible content.
|
||||
- [ ] Verify finish_reason mapping remains correct across paths.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] Normalized response shape is stable.
|
||||
- [ ] Tool-call and reasoning payloads survive normalization.
|
||||
- [ ] Targeted tests pass.
|
||||
|
||||
## #952 — [QA] Verify CLI voice beep toggle
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/952
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `b48ea41d2` — voice: add CLI beep toggle
|
||||
|
||||
### Targeted tests
|
||||
- `tests/tools/test_voice_cli_integration.py`
|
||||
|
||||
### Tasks
|
||||
- [ ] Enable the beep option in config and confirm voice mode emits the beep.
|
||||
- [ ] Disable the option and confirm the same path is silent.
|
||||
- [ ] Verify voice mode still strips markdown before speech output.
|
||||
- [ ] Verify voice mode does not pollute conversation history with TTS-only text.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] Beep behavior is actually toggled by config.
|
||||
- [ ] Existing voice/TTS integration behavior is not regressed.
|
||||
- [ ] Targeted tests pass.
|
||||
|
||||
## #953 — [QA] Verify bundled skill scripts run out of the box
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/953
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `328223576` — make bundled skill scripts runnable out of the box
|
||||
|
||||
### Targeted tests
|
||||
- `tests/agent/test_skill_commands.py`
|
||||
- `tests/tools/test_local_shell_init.py`
|
||||
|
||||
### Tasks
|
||||
- [ ] Pick a bundled skill that ships a script and run it without manual chmod/PATH surgery.
|
||||
- [ ] Verify local terminal execution resolves the installed skill script correctly.
|
||||
- [ ] Verify local shell init still behaves correctly.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] Bundled skill scripts execute from the installed skill location with no manual prep.
|
||||
- [ ] Local shell init remains healthy.
|
||||
- [ ] Targeted tests pass.
|
||||
|
||||
## #954 — [QA] Verify maps skill guest_house / camp_site / bakery expansion
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/954
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `c5a814b23` — maps: add guest_house, camp_site, and dual-key bakery lookup
|
||||
|
||||
### Tasks
|
||||
- [ ] Use the maps skill to search for a guest house in a known populated area.
|
||||
- [ ] Use the maps skill to search for a camp site in a known populated area.
|
||||
- [ ] Use the maps skill to search for a bakery and verify both supported keys resolve correctly.
|
||||
- [ ] Confirm results are sensible and non-empty.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] All three place types resolve correctly.
|
||||
- [ ] Bakery lookup works through both supported keys.
|
||||
- [ ] Manual evidence is attached in the issue.
|
||||
|
||||
## #955 — [QA] Verify KittenTTS local provider end-to-end
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/955
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `1830ebfc5` — add KittenTTS provider
|
||||
- `2d7ff9c5b` — complete KittenTTS integration across tools/setup/docs/tests
|
||||
|
||||
### Targeted tests
|
||||
- `tests/tools/test_tts_kittentts.py`
|
||||
|
||||
### Tasks
|
||||
- [ ] Configure TTS to use `kittentts`.
|
||||
- [ ] Generate speech to `.wav` and verify playable output.
|
||||
- [ ] Verify voice / speed / cleaned text are passed correctly.
|
||||
- [ ] Generate repeated requests and verify model caching behavior.
|
||||
- [ ] Generate a non-wav output and verify ffmpeg conversion path.
|
||||
- [ ] Verify missing-package behavior returns a helpful error.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] KittenTTS works end-to-end when installed.
|
||||
- [ ] Failure mode is operator-friendly when not installed.
|
||||
- [ ] Targeted tests pass.
|
||||
|
||||
## #956 — [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/956
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `d1ed6f4fb` — CLI: add numbered keyboard shortcuts to approval and clarify prompts
|
||||
|
||||
### Tasks
|
||||
- [ ] Trigger an approval prompt and choose an option with number keys.
|
||||
- [ ] Trigger a clarify prompt and choose an option with number keys.
|
||||
- [ ] Verify the correct option is submitted both times.
|
||||
- [ ] Verify normal keyboard navigation still works.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] Number-key selection works for both prompt types.
|
||||
- [ ] Legacy keyboard navigation is not broken.
|
||||
- [ ] Manual evidence is attached in the issue.
|
||||
|
||||
## #957 — [QA] Verify optional adversarial-ux-test skill catalog flow
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/957
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `e50e7f11b` — skills: add adversarial-ux-test optional skill
|
||||
|
||||
### Tasks
|
||||
- [ ] Verify the optional skill appears in the optional skill catalog.
|
||||
- [ ] Install or enable the skill.
|
||||
- [ ] Load it successfully through Hermes.
|
||||
- [ ] Disable or remove it and verify catalog state updates cleanly.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] Catalog listing is correct.
|
||||
- [ ] Install / load / disable lifecycle works cleanly.
|
||||
- [ ] Manual evidence is attached in the issue.
|
||||
|
||||
## #958 — [QA] Verify /usage account limits in CLI + gateway
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/958
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `8a11b0a20` — per-provider account limits module
|
||||
- `bcc5d7b67` — append account limits section in CLI and gateway
|
||||
|
||||
### Targeted tests
|
||||
- `tests/test_account_usage.py`
|
||||
- `tests/gateway/test_usage_command.py`
|
||||
|
||||
### Tasks
|
||||
- [ ] Run `/usage` in CLI for a provider with account limits.
|
||||
- [ ] Verify provider, remaining quota, total limit, and reset window render correctly.
|
||||
- [ ] Run `/usage` through the gateway and verify the same section appears.
|
||||
- [ ] Verify zero-value cache read/write sections stay hidden when appropriate.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] CLI and gateway both show the landed account-limits section correctly.
|
||||
- [ ] Targeted tests pass.
|
||||
|
||||
## #959 — [QA] Verify OpenCode-Go curated catalog additions
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/959
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `4fea1769d` — opencode-go: add Kimi K2.6 and Qwen3.5/3.6 Plus to curated catalog
|
||||
|
||||
### Targeted tests
|
||||
- `tests/hermes_cli/test_opencode_go_in_model_list.py`
|
||||
|
||||
### Tasks
|
||||
- [ ] With valid OpenCode-Go credentials, open `hermes model`.
|
||||
- [ ] Verify Kimi K2.6 appears.
|
||||
- [ ] Verify Qwen 3.5 Plus and 3.6 Plus appear.
|
||||
- [ ] Unset credentials and verify the provider/catalog hides correctly.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] New curated models are present when credentials exist.
|
||||
- [ ] Catalog visibility still respects credential gating.
|
||||
- [ ] Targeted tests pass.
|
||||
|
||||
## #960 — [QA] Verify patch 'did you mean?' suggestions
|
||||
|
||||
State: open
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/960
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `15abf4ed8` — add `did you mean?` feedback when patch fails to match
|
||||
- `5e6427a42` — gate it to true no-match cases and extend to v4a / skill_manage
|
||||
|
||||
### Targeted tests
|
||||
- `tests/tools/test_fuzzy_match.py`
|
||||
|
||||
### Tasks
|
||||
- [ ] Intentionally run a replace/patch with a near-miss `old_string`.
|
||||
- [ ] Verify the tool suggests a useful nearby line/context.
|
||||
- [ ] Verify suggestions only appear on true no-match failures.
|
||||
- [ ] Verify the behavior also works via file tools, v4a patching, and skill_manage.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] Suggestion quality is helpful, not noisy.
|
||||
- [ ] Suggestions are correctly gated to no-match cases.
|
||||
- [ ] Targeted tests pass.
|
||||
|
||||
## #961 — [QA] Verify web dashboard update/restart action buttons
|
||||
|
||||
State: closed
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/961
|
||||
|
||||
### Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
### Commits
|
||||
- `fc21c1420` — add buttons to update Hermes and restart gateway
|
||||
|
||||
### Files touched
|
||||
- `web/src/pages/StatusPage.tsx`
|
||||
- `web/src/lib/api.ts`
|
||||
- `web/src/i18n/en.ts`
|
||||
|
||||
### Tasks
|
||||
- [ ] Open the Web UI status page and verify both buttons are present.
|
||||
- [ ] Click Restart Gateway in a safe environment and verify running/output/success-or-failure states render.
|
||||
- [ ] Click Update Hermes and verify the same action lifecycle.
|
||||
- [ ] Verify the page remains responsive while actions are running.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] Both action buttons are present and wired.
|
||||
- [ ] Action status polling and result rendering work end-to-end.
|
||||
- [ ] Manual evidence is attached in the issue.
|
||||
|
||||
## #962 — [QA] Verify hardcoded-home path guard on burn/921 branch
|
||||
|
||||
State: closed
|
||||
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/962
|
||||
|
||||
### Branch / checkout
|
||||
- Validate specifically on `burn/921-poka-yoke-hardcoded-paths` (not upstream/main).
|
||||
|
||||
### Commits
|
||||
- `5dcb90531` — Poka-yoke: prevent hardcoded home-directory paths
|
||||
|
||||
### Targeted tests
|
||||
- `tests/test_path_guard.py`
|
||||
|
||||
### Tasks
|
||||
- [ ] Verify hardcoded `/Users/...` paths are rejected.
|
||||
- [ ] Verify hardcoded `~/.hermes/...` paths are rejected in guarded contexts.
|
||||
- [ ] Verify valid relative paths still pass.
|
||||
- [ ] Verify appropriate absolute paths still pass where intended.
|
||||
- [ ] Verify linting catches violations in non-test files.
|
||||
|
||||
### Acceptance criteria
|
||||
- [ ] Guard blocks the dangerous patterns and preserves allowed ones.
|
||||
- [ ] Targeted tests pass.
|
||||
@@ -1,301 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build a morning review packet from a Gitea epic and its child QA issues.
|
||||
|
||||
This script fetches a parent epic plus its sub-issues, extracts the structured
|
||||
sections from each QA issue body, and renders a single markdown packet suitable
|
||||
for morning review.
|
||||
|
||||
Usage:
|
||||
python scripts/morning_review_packet.py --epic-number 949
|
||||
python scripts/morning_review_packet.py --epic-number 949 --children 950-962
|
||||
python scripts/morning_review_packet.py --epic-number 949 --output docs/review_packets/hermes-harness-2026-04-21.md
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import urllib.request
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
# Defaults describing which Gitea instance and repository the packet is built
# from, plus the on-disk location of the API token used as a last resort.
DEFAULT_BASE_URL = "https://forge.alexanderwhitestone.com"
DEFAULT_OWNER = "Timmy_Foundation"
DEFAULT_REPO = "hermes-agent"
DEFAULT_TOKEN_PATH = Path.home().joinpath(".config", "gitea", "token")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class CommitEvidence:
    """A single commit reference taken from a QA issue's commit section."""

    # Backticked token found at the start of the bullet; empty string when the
    # bullet carried no such token.
    sha: str
    # Free-text description that accompanies the commit.
    summary: str
|
||||
|
||||
|
||||
@dataclass
class ReviewIssue:
    """Structured view of one child QA issue, ready to render into the packet."""

    # Identity and status copied from the issue payload.
    number: int
    title: str
    state: str
    url: str
    comments: int = 0
    # Number of the epic this issue points back to, when one could be found.
    parent_issue: int | None = None

    # Content extracted from the ``## ...`` sections of the issue body. Each
    # list uses a factory so instances never share a mutable default.
    checkout_notes: list[str] = field(default_factory=list)
    commits: list[CommitEvidence] = field(default_factory=list)
    targeted_tests: list[str] = field(default_factory=list)
    files_touched: list[str] = field(default_factory=list)
    tasks: list[str] = field(default_factory=list)
    acceptance_criteria: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
def parse_issue_number_spec(spec: str) -> list[int]:
    """Expand an issue spec such as ``950-952,955,962`` into issue numbers.

    The spec is a comma-separated list whose entries are either a single
    number or an inclusive ``start-end`` range. Empty entries are ignored and
    repeated numbers are reported once, at their first position.

    Raises:
        ValueError: if an entry is not numeric or a range runs backwards.
    """
    expanded: list[int] = []
    for token in spec.split(","):
        token = token.strip()
        if token == "":
            continue
        low_text, dash, high_text = token.partition("-")
        if not dash:
            expanded.append(int(token))
            continue
        low, high = int(low_text.strip()), int(high_text.strip())
        if high < low:
            raise ValueError(f"Invalid descending issue range: {token}")
        expanded.extend(range(low, high + 1))
    # dict keys keep insertion order, which gives first-seen de-duplication.
    return list(dict.fromkeys(expanded))
|
||||
|
||||
|
||||
def _parse_sections(body: str) -> dict[str, list[str]]:
|
||||
sections: dict[str, list[str]] = {}
|
||||
current: str | None = None
|
||||
for raw_line in body.splitlines():
|
||||
line = raw_line.rstrip()
|
||||
if line.startswith("## "):
|
||||
current = line[3:].strip()
|
||||
sections[current] = []
|
||||
continue
|
||||
if current is not None:
|
||||
sections[current].append(line)
|
||||
return sections
|
||||
|
||||
|
||||
def _clean_bullet(line: str) -> str | None:
|
||||
stripped = line.strip()
|
||||
if not stripped:
|
||||
return None
|
||||
stripped = re.sub(r"^-\s*\[(?: |x|X)\]\s*", "", stripped)
|
||||
stripped = re.sub(r"^-\s*", "", stripped)
|
||||
return stripped.strip() or None
|
||||
|
||||
|
||||
def _extract_bullets(lines: Iterable[str]) -> list[str]:
    """Return the non-empty cleaned text of every line in *lines*.

    Each line is passed through ``_clean_bullet``; lines for which it yields
    nothing are dropped and the order of the rest is kept.
    """
    return [text for text in map(_clean_bullet, lines) if text]
|
||||
|
||||
|
||||
def _extract_parent_issue(body: str, sections: dict[str, list[str]]) -> int | None:
|
||||
parent_lines = sections.get("Parent", [])
|
||||
for line in parent_lines:
|
||||
match = re.search(r"#(\d+)", line)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
match = re.search(r"Linked to Epic\s+#(\d+)", body, flags=re.IGNORECASE)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
return None
|
||||
|
||||
|
||||
def _extract_commits(lines: Iterable[str]) -> list[CommitEvidence]:
    """Turn the bullet lines of a commit section into ``CommitEvidence`` items.

    A bullet that starts with a backticked token (``- `abc123` summary``)
    yields that token as ``sha`` and the remaining text as ``summary``. One
    leading separator (``—``, ``–``, ``-`` or ``:``) followed by whitespace is
    dropped from the summary, so a bullet already written as
    ``- `abc123` — summary`` does not end up with a doubled dash when the
    packet is rendered. Bullets without a leading backticked token are kept
    whole, with an empty ``sha``.
    """
    commits: list[CommitEvidence] = []
    for item in _extract_bullets(lines):
        match = re.match(r"`([^`]+)`\s*(.*)", item)
        if match is None:
            commits.append(CommitEvidence(sha="", summary=item))
            continue
        # Only a separator that stands alone before the text is removed;
        # summaries that merely begin with a hyphenated word are untouched.
        summary = re.sub(r"^[-–—:]\s+", "", match.group(2).strip())
        commits.append(CommitEvidence(sha=match.group(1).strip(), summary=summary))
    return commits
|
||||
|
||||
|
||||
def _strip_backticks(items: Iterable[str]) -> list[str]:
|
||||
cleaned: list[str] = []
|
||||
for item in items:
|
||||
cleaned.append(item.replace("`", "").strip())
|
||||
return cleaned
|
||||
|
||||
|
||||
def discover_child_issue_numbers(epic_body: str) -> list[int]:
    """Collect the issue numbers referenced under the epic's ``## Sub-issues`` heading.

    Every ``#<digits>`` reference in that section is returned once, in order
    of first appearance. An absent or empty section yields an empty list.
    """
    listing = _parse_sections(epic_body).get("Sub-issues")
    if not listing:
        return []
    referenced = (int(digits) for row in listing for digits in re.findall(r"#(\d+)", row))
    # dict keys keep insertion order, which gives first-seen de-duplication.
    return list(dict.fromkeys(referenced))
|
||||
|
||||
|
||||
def parse_child_issue(issue: dict) -> ReviewIssue:
    """Convert a raw issue payload (a dict) into a ``ReviewIssue``.

    Scalar fields are read from the payload with fallbacks for missing or
    empty values; list fields come from the ``## ...`` sections of the body.
    The commit section is accepted under either the plural or the singular
    heading ("Commits landed today" / "Commit landed today").

    Raises:
        KeyError: if the payload has no ``number``.
    """
    body = issue.get("body") or ""
    sections = _parse_sections(body)

    def section(heading: str) -> list[str]:
        return sections.get(heading, [])

    def bullets(heading: str) -> list[str]:
        return _extract_bullets(section(heading))

    commit_lines = section("Commits landed today") or section("Commit landed today")

    return ReviewIssue(
        number=int(issue["number"]),
        title=issue.get("title") or "",
        state=(issue.get("state") or "unknown").lower(),
        url=issue.get("html_url") or issue.get("url") or "",
        comments=int(issue.get("comments") or 0),
        parent_issue=_extract_parent_issue(body, sections),
        checkout_notes=bullets("Branch / checkout"),
        commits=_extract_commits(commit_lines),
        targeted_tests=_strip_backticks(bullets("Targeted tests")),
        files_touched=_strip_backticks(bullets("Files touched")),
        tasks=bullets("Tasks"),
        acceptance_criteria=bullets("Acceptance Criteria"),
    )
|
||||
|
||||
|
||||
def build_packet_markdown(epic_issue: dict, child_issues: list[ReviewIssue]) -> str:
    """Render the epic and its child issues as one markdown document.

    Layout: a title and link to the epic, the epic body (whitespace-only lines
    become empty lines), a summary table with one row per child, then one
    ``##`` section per child in ascending issue-number order. Child
    sub-sections with no content are omitted. The result ends with exactly one
    newline.
    """
    title = epic_issue.get("title") or f"Epic #{epic_issue.get('number')}"
    url = epic_issue.get("html_url") or epic_issue.get("url") or ""
    body = epic_issue.get("body") or ""
    ordered = sorted(child_issues, key=lambda item: item.number)

    out: list[str] = [
        "# Morning Review Packet",
        "",
        f"Source epic: [{title}]({url})",
        "",
        "## Epic context",
        "",
        title,
        "",
    ]
    out.extend(row if row.strip() else "" for row in body.splitlines())
    out += [
        "",
        "## Summary",
        "",
        "| Issue | State | Commits | Tests |",
        "| --- | --- | --- | --- |",
    ]
    out.extend(
        f"| #{child.number} | {child.state} | {len(child.commits)} | {len(child.targeted_tests)} |"
        for child in ordered
    )
    out.append("")

    def emit(heading: str, rendered: list[str]) -> None:
        # A sub-section is written only when it has at least one entry.
        if rendered:
            out.append(heading)
            out.extend(rendered)
            out.append("")

    for child in ordered:
        out += [
            f"## #{child.number} — {child.title}",
            "",
            f"State: {child.state}",
            f"URL: {child.url}",
            "",
        ]
        emit("### Branch / checkout", [f"- {note}" for note in child.checkout_notes])
        emit(
            "### Commits",
            [
                f"- `{commit.sha}` — {commit.summary}" if commit.sha else f"- {commit.summary}"
                for commit in child.commits
            ],
        )
        emit("### Targeted tests", [f"- `{path}`" for path in child.targeted_tests])
        emit("### Files touched", [f"- `{path}`" for path in child.files_touched])
        emit("### Tasks", [f"- [ ] {task}" for task in child.tasks])
        emit("### Acceptance criteria", [f"- [ ] {entry}" for entry in child.acceptance_criteria])

    return "\n".join(out).rstrip() + "\n"
|
||||
|
||||
|
||||
def _resolve_token(explicit_token: str | None = None) -> str:
|
||||
if explicit_token:
|
||||
return explicit_token.strip()
|
||||
env_token = os.getenv("GITEA_TOKEN")
|
||||
if env_token:
|
||||
return env_token.strip()
|
||||
if DEFAULT_TOKEN_PATH.exists():
|
||||
return DEFAULT_TOKEN_PATH.read_text().strip()
|
||||
raise FileNotFoundError(f"No Gitea token found. Set GITEA_TOKEN or create {DEFAULT_TOKEN_PATH}")
|
||||
|
||||
|
||||
def fetch_issue(base_url: str, owner: str, repo: str, number: int, token: str) -> dict:
    """Fetch one issue from the Gitea REST API and return the decoded JSON.

    The request goes to ``/api/v1/repos/{owner}/{repo}/issues/{number}`` under
    *base_url* (a trailing slash on *base_url* is tolerated), authenticated
    with a ``token`` Authorization header, with a 30 second timeout.
    """
    endpoint = f"{base_url.rstrip('/')}/api/v1/repos/{owner}/{repo}/issues/{number}"
    auth_headers = {"Authorization": f"token {token}"}
    request = urllib.request.Request(endpoint, headers=auth_headers)
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read().decode()
    return json.loads(payload)
|
||||
|
||||
|
||||
def collect_child_issues(base_url: str, owner: str, repo: str, epic_issue: dict, token: str, children_spec: str | None = None) -> list[dict]:
    """Fetch the raw payload of every child issue belonging to an epic.

    When *children_spec* is given it is parsed as an explicit list of issue
    numbers/ranges; otherwise the numbers are discovered from the epic body.
    Issues are fetched one at a time, in that order.
    """
    if children_spec:
        numbers = parse_issue_number_spec(children_spec)
    else:
        numbers = discover_child_issue_numbers(epic_issue.get("body") or "")

    fetched: list[dict] = []
    for number in numbers:
        fetched.append(fetch_issue(base_url, owner, repo, number, token))
    return fetched
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: build the packet and write or print it.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success. Failures (no token available, HTTP errors, bad
        ``--children`` spec) propagate as exceptions.
    """
    parser = argparse.ArgumentParser(description="Build a markdown morning review packet from a Gitea epic")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--owner", default=DEFAULT_OWNER)
    parser.add_argument("--repo", default=DEFAULT_REPO)
    parser.add_argument("--epic-number", type=int, required=True)
    parser.add_argument("--children", help="Explicit issue list/ranges, e.g. 950-962")
    parser.add_argument("--token", help="Gitea token (defaults to GITEA_TOKEN or ~/.config/gitea/token)")
    parser.add_argument("--output", help="Write markdown packet to this path instead of stdout")
    args = parser.parse_args(argv)

    token = _resolve_token(args.token)
    epic_issue = fetch_issue(args.base_url, args.owner, args.repo, args.epic_number, token)
    child_issue_dicts = collect_child_issues(args.base_url, args.owner, args.repo, epic_issue, token, args.children)
    packet = build_packet_markdown(epic_issue, [parse_child_issue(issue) for issue in child_issue_dicts])

    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # The packet always contains non-ASCII text (the em dash in each issue
        # heading), so pin the encoding rather than rely on the platform default.
        output_path.write_text(packet, encoding="utf-8")
    else:
        print(packet, end="")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the CLI and hand its return value to the interpreter as exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
@@ -1,162 +0,0 @@
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parents[1] / "scripts"
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
|
||||
import morning_review_packet as mrp
|
||||
|
||||
|
||||
EPIC_BODY = """Source: git log on upstream/main since 2026-04-21 00:00 EDT.
|
||||
|
||||
## Success criteria
|
||||
- [ ] Every issue has a clear PASS / FAIL outcome.
|
||||
|
||||
## Sub-issues
|
||||
- [ ] #950 [QA] Verify AI Gateway provider UX + attribution headers
|
||||
- [ ] #951 [QA] Verify transport abstraction + AnthropicTransport wiring
|
||||
- [x] #962 [QA] Verify hardcoded-home path guard on burn/921 branch
|
||||
"""
|
||||
|
||||
|
||||
CHILD_BODY_PLURAL = """## Parent
|
||||
#949
|
||||
|
||||
## Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
## Commits landed today
|
||||
- `b11753879` attribution default_headers for ai-gateway provider
|
||||
- `700437440` curated picker with live pricing
|
||||
|
||||
## Targeted tests
|
||||
- `tests/hermes_cli/test_ai_gateway_models.py`
|
||||
- `tests/run_agent/test_provider_attribution_headers.py`
|
||||
|
||||
## Tasks
|
||||
- [ ] Verify the picker ordering.
|
||||
- [ ] Verify attribution headers.
|
||||
|
||||
## Acceptance Criteria
|
||||
- [ ] Picker shows AI Gateway prominently.
|
||||
- [ ] Headers appear on OpenRouter calls.
|
||||
"""
|
||||
|
||||
|
||||
CHILD_BODY_SINGULAR = """## Parent
|
||||
#949
|
||||
|
||||
## Branch / checkout
|
||||
- Validate on `upstream/main` or an equivalent synced checkout.
|
||||
|
||||
## Commit landed today
|
||||
- `fc21c1420` add buttons to update Hermes and restart gateway
|
||||
|
||||
## Files touched
|
||||
- `web/src/pages/StatusPage.tsx`
|
||||
- `web/src/lib/api.ts`
|
||||
- `web/src/i18n/en.ts`
|
||||
|
||||
## Tasks
|
||||
- [ ] Open the Web UI status page and verify both buttons are present.
|
||||
- [ ] Click Restart Gateway in a safe environment.
|
||||
"""
|
||||
|
||||
|
||||
def test_discover_child_issue_numbers_from_epic_body():
    """Numbers under the epic's ``## Sub-issues`` heading come back in listed order."""
    discovered = mrp.discover_child_issue_numbers(EPIC_BODY)
    assert discovered == [950, 951, 962]
|
||||
|
||||
|
||||
def test_parse_issue_number_spec_supports_ranges_and_lists():
    """A spec mixing a range and single numbers expands to the full list."""
    expanded = mrp.parse_issue_number_spec("950-952,955,962")
    assert expanded == [950, 951, 952, 955, 962]
|
||||
|
||||
|
||||
def test_parse_child_issue_extracts_structured_sections():
    """A child issue with the plural commit heading is parsed section by section."""
    raw_issue = dict(
        number=950,
        title="[QA] Verify AI Gateway provider UX + attribution headers",
        state="open",
        html_url="https://forge.example/950",
        comments=0,
        body=CHILD_BODY_PLURAL,
    )

    review = mrp.parse_child_issue(raw_issue)
    commit_shas = [commit.sha for commit in review.commits]

    assert review.number == 950
    assert review.parent_issue == 949
    assert review.checkout_notes == ["Validate on `upstream/main` or an equivalent synced checkout."]
    assert commit_shas == ["b11753879", "700437440"]
    assert review.targeted_tests == [
        "tests/hermes_cli/test_ai_gateway_models.py",
        "tests/run_agent/test_provider_attribution_headers.py",
    ]
    assert review.tasks == [
        "Verify the picker ordering.",
        "Verify attribution headers.",
    ]
    assert review.acceptance_criteria == [
        "Picker shows AI Gateway prominently.",
        "Headers appear on OpenRouter calls.",
    ]
|
||||
|
||||
|
||||
def test_parse_child_issue_handles_singular_commit_heading_and_files_touched():
    """The singular commit heading and the files-touched section are both picked up."""
    raw_issue = dict(
        number=961,
        title="[QA] Verify web dashboard update/restart action buttons",
        state="closed",
        html_url="https://forge.example/961",
        comments=16,
        body=CHILD_BODY_SINGULAR,
    )

    review = mrp.parse_child_issue(raw_issue)
    commit_shas = [commit.sha for commit in review.commits]

    assert commit_shas == ["fc21c1420"]
    assert review.files_touched == [
        "web/src/pages/StatusPage.tsx",
        "web/src/lib/api.ts",
        "web/src/i18n/en.ts",
    ]
    assert review.tasks == [
        "Open the Web UI status page and verify both buttons are present.",
        "Click Restart Gateway in a safe environment.",
    ]
|
||||
|
||||
|
||||
def test_build_packet_markdown_renders_summary_and_details():
    """The rendered packet carries the title, summary rows and per-issue details."""
    epic_issue = dict(
        number=949,
        title="EPIC: Morning review packet — Hermes harness features landed 2026-04-21",
        state="open",
        html_url="https://forge.example/949",
        body=EPIC_BODY,
    )
    raw_children = [
        dict(
            number=950,
            title="[QA] Verify AI Gateway provider UX + attribution headers",
            state="open",
            html_url="https://forge.example/950",
            comments=0,
            body=CHILD_BODY_PLURAL,
        ),
        dict(
            number=961,
            title="[QA] Verify web dashboard update/restart action buttons",
            state="closed",
            html_url="https://forge.example/961",
            comments=16,
            body=CHILD_BODY_SINGULAR,
        ),
    ]
    children = [mrp.parse_child_issue(raw) for raw in raw_children]

    markdown = mrp.build_packet_markdown(epic_issue, children)

    expected_fragments = (
        "# Morning Review Packet",
        "EPIC: Morning review packet — Hermes harness features landed 2026-04-21",
        "| #950 | open | 2 | 2 |",
        "| #961 | closed | 1 | 0 |",
        "## #950 — [QA] Verify AI Gateway provider UX + attribution headers",
        "## #961 — [QA] Verify web dashboard update/restart action buttons",
        "`b11753879` — attribution default_headers for ai-gateway provider",
        "`web/src/pages/StatusPage.tsx`",
    )
    for fragment in expected_fragments:
        assert fragment in markdown
|
||||
@@ -11,12 +11,14 @@ import pytest
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
|
||||
|
||||
from vision_benchmark import (
|
||||
analyze_with_model,
|
||||
compute_ocr_accuracy,
|
||||
compute_description_completeness,
|
||||
compute_structural_accuracy,
|
||||
aggregate_results,
|
||||
to_markdown,
|
||||
generate_sample_dataset,
|
||||
load_dataset,
|
||||
MODELS,
|
||||
EVAL_PROMPTS,
|
||||
)
|
||||
@@ -197,6 +199,71 @@ class TestMarkdown:
|
||||
|
||||
|
||||
class TestDataset:
|
||||
def test_repo_dataset_uses_local_image_paths(self):
|
||||
dataset_path = Path(__file__).parent.parent / "benchmarks" / "test_images.json"
|
||||
dataset = json.loads(dataset_path.read_text())
|
||||
|
||||
assert dataset, "benchmark dataset should not be empty"
|
||||
assert all(not entry["url"].startswith(("http://", "https://")) for entry in dataset)
|
||||
|
||||
def test_load_dataset_resolves_relative_local_paths(self, tmp_path):
|
||||
images_dir = tmp_path / "images"
|
||||
images_dir.mkdir()
|
||||
image_path = images_dir / "sample.png"
|
||||
image_path.write_bytes(b"png-bytes")
|
||||
|
||||
dataset_path = tmp_path / "dataset.json"
|
||||
dataset_path.write_text(json.dumps([
|
||||
{
|
||||
"id": "sample",
|
||||
"url": "images/sample.png",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1},
|
||||
}
|
||||
]))
|
||||
|
||||
loaded = load_dataset(str(dataset_path))
|
||||
|
||||
assert loaded[0]["url"] == str(image_path.resolve())
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_analyze_with_model_encodes_local_file_as_data_url(self, tmp_path, monkeypatch):
|
||||
image_path = tmp_path / "tiny.png"
|
||||
image_path.write_bytes(
|
||||
bytes.fromhex(
|
||||
"89504E470D0A1A0A"
|
||||
"0000000D49484452000000010000000108060000001F15C489"
|
||||
"0000000D49444154789C6360000002000154A24F5D00000000"
|
||||
"49454E44AE426082"
|
||||
)
|
||||
)
|
||||
|
||||
fake_response = MagicMock()
|
||||
fake_response.raise_for_status.return_value = None
|
||||
fake_response.json.return_value = {
|
||||
"choices": [{"message": {"content": "Looks like a tiny image."}}],
|
||||
"usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3},
|
||||
}
|
||||
|
||||
fake_client = MagicMock()
|
||||
fake_client.post = AsyncMock(return_value=fake_response)
|
||||
fake_ctx = MagicMock()
|
||||
fake_ctx.__aenter__ = AsyncMock(return_value=fake_client)
|
||||
fake_ctx.__aexit__ = AsyncMock(return_value=None)
|
||||
|
||||
monkeypatch.setenv("OPENROUTER_API_KEY", "test-key")
|
||||
with patch("httpx.AsyncClient", return_value=fake_ctx):
|
||||
result = await analyze_with_model(
|
||||
str(image_path),
|
||||
"Describe this image",
|
||||
{"provider": "openrouter", "model_id": "fake/model"},
|
||||
)
|
||||
|
||||
assert result["success"] is True
|
||||
sent_url = fake_client.post.await_args.kwargs["json"]["messages"][0]["content"][1]["image_url"]["url"]
|
||||
assert sent_url.startswith("data:image/png;base64,")
|
||||
|
||||
def test_sample_dataset_has_entries(self):
|
||||
dataset = generate_sample_dataset()
|
||||
assert len(dataset) >= 4
|
||||
|
||||