fix: vendor vision benchmark fixtures (#868)

2026-04-22 11:37:04 -04:00
30 changed files with 332 additions and 919 deletions
--- a/benchmarks/test_images.json
+++ b/benchmarks/test_images.json
@@ -1,194 +1,354 @@
 [
  {
    "id": "screenshot_github_home",
-    "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
+    "url": "test_images/screenshot_github_home.png",
    "category": "screenshot",
-    "expected_keywords": ["github", "logo", "mark"],
+    "expected_keywords": [
+      "github",
+      "logo",
+      "mark"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "diagram_mermaid_flow",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
+    "url": "test_images/diagram_mermaid_flow.png",
    "category": "diagram",
-    "expected_keywords": ["flow", "diagram", "process"],
+    "expected_keywords": [
+      "flow",
+      "diagram",
+      "process"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_1",
-    "url": "https://picsum.photos/seed/vision1/400/300",
+    "url": "test_images/photo_random_1.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_2",
-    "url": "https://picsum.photos/seed/vision2/400/300",
+    "url": "test_images/photo_random_2.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_simple_bar",
-    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
+    "url": "test_images/chart_simple_bar.png",
    "category": "chart",
-    "expected_keywords": ["bar", "chart", "revenue"],
+    "expected_keywords": [
+      "bar",
+      "chart",
+      "revenue"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "chart_pie",
-    "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
+    "url": "test_images/chart_pie.png",
    "category": "chart",
-    "expected_keywords": ["pie", "chart", "percentage"],
+    "expected_keywords": [
+      "pie",
+      "chart",
+      "percentage"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "diagram_org_chart",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "test_images/diagram_org_chart.png",
    "category": "diagram",
-    "expected_keywords": ["organization", "hierarchy", "chart"],
+    "expected_keywords": [
+      "organization",
+      "hierarchy",
+      "chart"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "screenshot_terminal",
-    "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
+    "url": "test_images/screenshot_terminal.png",
    "category": "screenshot",
-    "expected_keywords": ["terminal", "command", "output"],
+    "expected_keywords": [
+      "terminal",
+      "command",
+      "output"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_3",
-    "url": "https://picsum.photos/seed/vision3/400/300",
+    "url": "test_images/photo_random_3.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_line",
-    "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
+    "url": "test_images/chart_line.png",
    "category": "chart",
-    "expected_keywords": ["line", "chart", "temperature"],
+    "expected_keywords": [
+      "line",
+      "chart",
+      "temperature"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "diagram_sequence",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "test_images/diagram_sequence.png",
    "category": "diagram",
-    "expected_keywords": ["sequence", "interaction", "message"],
+    "expected_keywords": [
+      "sequence",
+      "interaction",
+      "message"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_4",
-    "url": "https://picsum.photos/seed/vision4/400/300",
+    "url": "test_images/photo_random_4.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "screenshot_webpage",
-    "url": "https://github.githubassets.com/images/modules/site/social-cards.png",
+    "url": "test_images/screenshot_webpage.png",
    "category": "screenshot",
-    "expected_keywords": ["github", "page", "web"],
+    "expected_keywords": [
+      "github",
+      "page",
+      "web"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_radar",
-    "url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
+    "url": "test_images/chart_radar.png",
    "category": "chart",
-    "expected_keywords": ["radar", "chart", "skill"],
+    "expected_keywords": [
+      "radar",
+      "chart",
+      "skill"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "photo_random_5",
-    "url": "https://picsum.photos/seed/vision5/400/300",
+    "url": "test_images/photo_random_5.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "diagram_class",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "test_images/diagram_class.png",
    "category": "diagram",
-    "expected_keywords": ["class", "object", "attribute"],
+    "expected_keywords": [
+      "class",
+      "object",
+      "attribute"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_doughnut",
-    "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
+    "url": "test_images/chart_doughnut.png",
    "category": "chart",
-    "expected_keywords": ["doughnut", "chart", "device"],
+    "expected_keywords": [
+      "doughnut",
+      "chart",
+      "device"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "photo_random_6",
-    "url": "https://picsum.photos/seed/vision6/400/300",
+    "url": "test_images/photo_random_6.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "screenshot_error",
-    "url": "https://http.cat/404.jpg",
+    "url": "test_images/screenshot_error.png",
    "category": "screenshot",
-    "expected_keywords": ["404", "error", "cat"],
+    "expected_keywords": [
+      "404",
+      "error",
+      "cat"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
  },
  {
    "id": "diagram_network",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "test_images/diagram_network.png",
    "category": "diagram",
-    "expected_keywords": ["network", "node", "connection"],
+    "expected_keywords": [
+      "network",
+      "node",
+      "connection"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_7",
-    "url": "https://picsum.photos/seed/vision7/400/300",
+    "url": "test_images/photo_random_7.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_stacked_bar",
-    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
+    "url": "test_images/chart_stacked_bar.png",
    "category": "chart",
-    "expected_keywords": ["stacked", "bar", "chart"],
+    "expected_keywords": [
+      "stacked",
+      "bar",
+      "chart"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "screenshot_dashboard",
-    "url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
+    "url": "test_images/screenshot_dashboard.png",
    "category": "screenshot",
-    "expected_keywords": ["search", "code", "feature"],
+    "expected_keywords": [
+      "search",
+      "code",
+      "feature"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_8",
-    "url": "https://picsum.photos/seed/vision8/400/300",
+    "url": "test_images/photo_random_8.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  }
 ]
--- a/benchmarks/test_images/chart_doughnut.png
+++ b/benchmarks/test_images/chart_doughnut.png
--- a/benchmarks/test_images/chart_line.png
+++ b/benchmarks/test_images/chart_line.png
--- a/benchmarks/test_images/chart_pie.png
+++ b/benchmarks/test_images/chart_pie.png
--- a/benchmarks/test_images/chart_radar.png
+++ b/benchmarks/test_images/chart_radar.png
--- a/benchmarks/test_images/chart_simple_bar.png
+++ b/benchmarks/test_images/chart_simple_bar.png
--- a/benchmarks/test_images/chart_stacked_bar.png
+++ b/benchmarks/test_images/chart_stacked_bar.png
--- a/benchmarks/test_images/diagram_class.png
+++ b/benchmarks/test_images/diagram_class.png
--- a/benchmarks/test_images/diagram_mermaid_flow.png
+++ b/benchmarks/test_images/diagram_mermaid_flow.png
--- a/benchmarks/test_images/diagram_network.png
+++ b/benchmarks/test_images/diagram_network.png
--- a/benchmarks/test_images/diagram_org_chart.png
+++ b/benchmarks/test_images/diagram_org_chart.png
--- a/benchmarks/test_images/diagram_sequence.png
+++ b/benchmarks/test_images/diagram_sequence.png
--- a/benchmarks/test_images/photo_random_1.png
+++ b/benchmarks/test_images/photo_random_1.png
--- a/benchmarks/test_images/photo_random_2.png
+++ b/benchmarks/test_images/photo_random_2.png
--- a/benchmarks/test_images/photo_random_3.png
+++ b/benchmarks/test_images/photo_random_3.png
--- a/benchmarks/test_images/photo_random_4.png
+++ b/benchmarks/test_images/photo_random_4.png
--- a/benchmarks/test_images/photo_random_5.png
+++ b/benchmarks/test_images/photo_random_5.png
--- a/benchmarks/test_images/photo_random_6.png
+++ b/benchmarks/test_images/photo_random_6.png
--- a/benchmarks/test_images/photo_random_7.png
+++ b/benchmarks/test_images/photo_random_7.png
--- a/benchmarks/test_images/photo_random_8.png
+++ b/benchmarks/test_images/photo_random_8.png
--- a/benchmarks/test_images/screenshot_dashboard.png
+++ b/benchmarks/test_images/screenshot_dashboard.png
--- a/benchmarks/test_images/screenshot_error.png
+++ b/benchmarks/test_images/screenshot_error.png
--- a/benchmarks/test_images/screenshot_github_home.png
+++ b/benchmarks/test_images/screenshot_github_home.png
--- a/benchmarks/test_images/screenshot_terminal.png
+++ b/benchmarks/test_images/screenshot_terminal.png
--- a/benchmarks/test_images/screenshot_webpage.png
+++ b/benchmarks/test_images/screenshot_webpage.png
--- a/benchmarks/vision_benchmark.py
+++ b/benchmarks/vision_benchmark.py
@@ -11,17 +11,19 @@ Usage:

    # Single image test
    python benchmarks/vision_benchmark.py --url https://example.com/image.png
+    python benchmarks/vision_benchmark.py --url benchmarks/test_images/photo_random_1.png

    # Generate test report
    python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json

-Test image dataset: benchmarks/test_images.json (50-100 diverse images)
+Test image dataset: benchmarks/test_images.json (committed local fixtures under benchmarks/test_images/)
 """

 import argparse
 import asyncio
 import base64
 import json
+import mimetypes
 import os
 import statistics
 import sys
@@ -67,6 +69,28 @@ EVAL_PROMPTS = {
 # ---------------------------------------------------------------------------


+def _is_remote_image_source(image_source: str) -> bool:
+    return image_source.startswith(("http://", "https://", "data:", "file://"))
+
+
+def _image_source_to_payload_url(image_source: str) -> str:
+    """Convert local image paths into data URLs; keep remote URLs unchanged."""
+    if image_source.startswith(("http://", "https://", "data:")):
+        return image_source
+
+    resolved = image_source[len("file://"):] if image_source.startswith("file://") else image_source
+    local_path = Path(os.path.expanduser(resolved)).resolve()
+    if not local_path.is_file():
+        return image_source
+
+    mime_type, _ = mimetypes.guess_type(str(local_path))
+    if not mime_type:
+        mime_type = "application/octet-stream"
+
+    encoded = base64.b64encode(local_path.read_bytes()).decode("ascii")
+    return f"data:{mime_type};base64,{encoded}"
+
+
 async def analyze_with_model(
    image_url: str,
    prompt: str,
@@ -84,6 +108,8 @@ async def analyze_with_model(
    """
    import httpx

+    image_payload_url = _image_source_to_payload_url(image_url)
+
    provider = model_config["provider"]
    model_id = model_config["model_id"]

@@ -93,7 +119,7 @@ async def analyze_with_model(
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "image_url", "image_url": {"url": image_payload_url}},
            ],
        }
    ]
@@ -570,8 +596,18 @@ def generate_sample_dataset() -> List[dict]:

 def load_dataset(path: str) -> List[dict]:
    """Load test dataset from JSON file."""
-    with open(path) as f:
-        return json.load(f)
+    dataset_path = Path(path).resolve()
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+
+    base_dir = dataset_path.parent
+    for image in dataset:
+        image_url = image.get("url")
+        if not image_url or _is_remote_image_source(image_url):
+            continue
+        image["url"] = str((base_dir / image_url).resolve())
+
+    return dataset


 # ---------------------------------------------------------------------------
@@ -582,7 +618,7 @@ def load_dataset(path: str) -> List[dict]:
 async def main():
    parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
    parser.add_argument("--images", help="Path to test images JSON file")
-    parser.add_argument("--url", help="Single image URL to test")
+    parser.add_argument("--url", help="Single image URL or local file path to test")
    parser.add_argument("--category", default="photo", help="Category for single URL")
    parser.add_argument("--output", default=None, help="Output JSON file")
    parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
--- a/docs/review_packets/hermes-harness-2026-04-21.md
+++ b/docs/review_packets/hermes-harness-2026-04-21.md
@@ -1,387 +0,0 @@
-# Morning Review Packet
-
-Source epic: [EPIC: Morning review packet — Hermes harness features landed 2026-04-21](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/949)
-
-## Epic context
-
-EPIC: Morning review packet — Hermes harness features landed 2026-04-21
-
-Source: git log on upstream/main since 2026-04-21 00:00 EDT, plus the current local branch `burn/921-poka-yoke-hardcoded-paths` for the branch-only path-guard work.
-
-Important review note:
- Validate upstream-landed features on `upstream/main` or a synced branch.
- Validate the path-guard work on `burn/921-poka-yoke-hardcoded-paths`.
-
-This epic is a morning-review packet: one QA issue per feature cluster, each with concrete acceptance criteria and targeted tests or manual checks.
-
-## Success criteria
- [ ] Every issue has a clear PASS / FAIL outcome.
- [ ] Test output or manual evidence is attached to each issue.
- [ ] Any drift between upstream/main and forge/main is called out explicitly.
-
-## Sub-issues
-### Upstream/main features landed 2026-04-21
- [ ] #950 [QA] Verify AI Gateway provider UX + attribution headers
- [ ] #951 [QA] Verify transport abstraction + AnthropicTransport wiring
- [ ] #952 [QA] Verify CLI voice beep toggle
- [ ] #953 [QA] Verify bundled skill scripts run out of the box
- [ ] #954 [QA] Verify maps skill guest_house / camp_site / bakery expansion
- [ ] #955 [QA] Verify KittenTTS local provider end-to-end
- [ ] #956 [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
- [ ] #957 [QA] Verify optional adversarial-ux-test skill catalog flow
- [ ] #958 [QA] Verify /usage account limits in CLI + gateway
- [ ] #959 [QA] Verify OpenCode-Go curated catalog additions
- [ ] #960 [QA] Verify patch 'did you mean?' suggestions
- [ ] #961 [QA] Verify web dashboard update/restart action buttons
-
-### Local branch-only work
- [ ] #962 [QA] Verify hardcoded-home path guard on burn/921 branch
-
-## Summary
-
-| Issue | State | Commits | Tests |
-| --- | --- | --- | --- |
-| #950 | open | 5 | 2 |
-| #951 | open | 2 | 2 |
-| #952 | open | 1 | 1 |
-| #953 | open | 1 | 2 |
-| #954 | open | 1 | 0 |
-| #955 | open | 2 | 1 |
-| #956 | open | 1 | 0 |
-| #957 | open | 1 | 0 |
-| #958 | open | 2 | 2 |
-| #959 | open | 1 | 1 |
-| #960 | open | 2 | 1 |
-| #961 | closed | 1 | 0 |
-| #962 | closed | 1 | 1 |
-
-## #950 — [QA] Verify AI Gateway provider UX + attribution headers
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/950
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `b11753879` — attribution default_headers for ai-gateway provider
- `700437440` — curated picker with live pricing
- `ac26a460f` — promote ai-gateway in provider picker ordering
- `5bb2d11b0` — auto-promote free Moonshot models
- `29f57ec95` — Vercel deep-link for API key creation
-
-### Targeted tests
- `tests/hermes_cli/test_ai_gateway_models.py`
- `tests/run_agent/test_provider_attribution_headers.py`
-
-### Tasks
- [ ] Open `hermes model` and verify `ai-gateway` appears near the top.
- [ ] Verify live pricing appears in the picker.
- [ ] Verify free Moonshot models are promoted.
- [ ] Trigger API-key setup flow and verify the Vercel deep link.
- [ ] Send one ai-gateway request and verify attribution headers are attached.
-
-### Acceptance criteria
- [ ] UI ordering and pricing match the landed behavior.
- [ ] Attribution headers are present on ai-gateway requests.
- [ ] Targeted tests pass.
-
-## #951 — [QA] Verify transport abstraction + AnthropicTransport wiring
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/951
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `7ab5eebd0` — transport types + Anthropic normalize migration
- `731f4fbae` — transport ABC + AnthropicTransport wired to all paths
-
-### Targeted tests
- `tests/agent/transports/test_types.py`
- `tests/agent/test_anthropic_normalize_v2.py`
-
-### Tasks
- [ ] Verify plain-text Anthropic responses normalize correctly.
- [ ] Verify tool-call responses preserve IDs, names, and arguments.
- [ ] Verify reasoning/thinking is preserved separately from visible content.
- [ ] Verify finish_reason mapping remains correct across paths.
-
-### Acceptance criteria
- [ ] Normalized response shape is stable.
- [ ] Tool-call and reasoning payloads survive normalization.
- [ ] Targeted tests pass.
-
-## #952 — [QA] Verify CLI voice beep toggle
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/952
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `b48ea41d2` — voice: add CLI beep toggle
-
-### Targeted tests
- `tests/tools/test_voice_cli_integration.py`
-
-### Tasks
- [ ] Enable the beep option in config and confirm voice mode emits the beep.
- [ ] Disable the option and confirm the same path is silent.
- [ ] Verify voice mode still strips markdown before speech output.
- [ ] Verify voice mode does not pollute conversation history with TTS-only text.
-
-### Acceptance criteria
- [ ] Beep behavior is actually toggled by config.
- [ ] Existing voice/TTS integration behavior is not regressed.
- [ ] Targeted tests pass.
-
-## #953 — [QA] Verify bundled skill scripts run out of the box
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/953
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `328223576` — make bundled skill scripts runnable out of the box
-
-### Targeted tests
- `tests/agent/test_skill_commands.py`
- `tests/tools/test_local_shell_init.py`
-
-### Tasks
- [ ] Pick a bundled skill that ships a script and run it without manual chmod/PATH surgery.
- [ ] Verify local terminal execution resolves the installed skill script correctly.
- [ ] Verify local shell init still behaves correctly.
-
-### Acceptance criteria
- [ ] Bundled skill scripts execute from the installed skill location with no manual prep.
- [ ] Local shell init remains healthy.
- [ ] Targeted tests pass.
-
-## #954 — [QA] Verify maps skill guest_house / camp_site / bakery expansion
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/954
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `c5a814b23` — maps: add guest_house, camp_site, and dual-key bakery lookup
-
-### Tasks
- [ ] Use the maps skill to search for a guest house in a known populated area.
- [ ] Use the maps skill to search for a camp site in a known populated area.
- [ ] Use the maps skill to search for a bakery and verify both supported keys resolve correctly.
- [ ] Confirm results are sensible and non-empty.
-
-### Acceptance criteria
- [ ] All three place types resolve correctly.
- [ ] Bakery lookup works through both supported keys.
- [ ] Manual evidence is attached in the issue.
-
-## #955 — [QA] Verify KittenTTS local provider end-to-end
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/955
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `1830ebfc5` — add KittenTTS provider
- `2d7ff9c5b` — complete KittenTTS integration across tools/setup/docs/tests
-
-### Targeted tests
- `tests/tools/test_tts_kittentts.py`
-
-### Tasks
- [ ] Configure TTS to use `kittentts`.
- [ ] Generate speech to `.wav` and verify playable output.
- [ ] Verify voice / speed / cleaned text are passed correctly.
- [ ] Generate repeated requests and verify model caching behavior.
- [ ] Generate a non-wav output and verify ffmpeg conversion path.
- [ ] Verify missing-package behavior returns a helpful error.
-
-### Acceptance criteria
- [ ] KittenTTS works end-to-end when installed.
- [ ] Failure mode is operator-friendly when not installed.
- [ ] Targeted tests pass.
-
-## #956 — [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/956
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `d1ed6f4fb` — CLI: add numbered keyboard shortcuts to approval and clarify prompts
-
-### Tasks
- [ ] Trigger an approval prompt and choose an option with number keys.
- [ ] Trigger a clarify prompt and choose an option with number keys.
- [ ] Verify the correct option is submitted both times.
- [ ] Verify normal keyboard navigation still works.
-
-### Acceptance criteria
- [ ] Number-key selection works for both prompt types.
- [ ] Legacy keyboard navigation is not broken.
- [ ] Manual evidence is attached in the issue.
-
-## #957 — [QA] Verify optional adversarial-ux-test skill catalog flow
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/957
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `e50e7f11b` — skills: add adversarial-ux-test optional skill
-
-### Tasks
- [ ] Verify the optional skill appears in the optional skill catalog.
- [ ] Install or enable the skill.
- [ ] Load it successfully through Hermes.
- [ ] Disable or remove it and verify catalog state updates cleanly.
-
-### Acceptance criteria
- [ ] Catalog listing is correct.
- [ ] Install / load / disable lifecycle works cleanly.
- [ ] Manual evidence is attached in the issue.
-
-## #958 — [QA] Verify /usage account limits in CLI + gateway
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/958
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `8a11b0a20` — per-provider account limits module
- `bcc5d7b67` — append account limits section in CLI and gateway
-
-### Targeted tests
- `tests/test_account_usage.py`
- `tests/gateway/test_usage_command.py`
-
-### Tasks
- [ ] Run `/usage` in CLI for a provider with account limits.
- [ ] Verify provider, remaining quota, total limit, and reset window render correctly.
- [ ] Run `/usage` through the gateway and verify the same section appears.
- [ ] Verify zero-value cache read/write sections stay hidden when appropriate.
-
-### Acceptance criteria
- [ ] CLI and gateway both show the landed account-limits section correctly.
- [ ] Targeted tests pass.
-
-## #959 — [QA] Verify OpenCode-Go curated catalog additions
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/959
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `4fea1769d` — opencode-go: add Kimi K2.6 and Qwen3.5/3.6 Plus to curated catalog
-
-### Targeted tests
- `tests/hermes_cli/test_opencode_go_in_model_list.py`
-
-### Tasks
- [ ] With valid OpenCode-Go credentials, open `hermes model`.
- [ ] Verify Kimi K2.6 appears.
- [ ] Verify Qwen 3.5 Plus and 3.6 Plus appear.
- [ ] Unset credentials and verify the provider/catalog hides correctly.
-
-### Acceptance criteria
- [ ] New curated models are present when credentials exist.
- [ ] Catalog visibility still respects credential gating.
- [ ] Targeted tests pass.
-
-## #960 — [QA] Verify patch 'did you mean?' suggestions
-
-State: open
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/960
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `15abf4ed8` — add `did you mean?` feedback when patch fails to match
- `5e6427a42` — gate it to true no-match cases and extend to v4a / skill_manage
-
-### Targeted tests
- `tests/tools/test_fuzzy_match.py`
-
-### Tasks
- [ ] Intentionally run a replace/patch with a near-miss `old_string`.
- [ ] Verify the tool suggests a useful nearby line/context.
- [ ] Verify suggestions only appear on true no-match failures.
- [ ] Verify the behavior also works via file tools, v4a patching, and skill_manage.
-
-### Acceptance criteria
- [ ] Suggestion quality is helpful, not noisy.
- [ ] Suggestions are correctly gated to no-match cases.
- [ ] Targeted tests pass.
-
-## #961 — [QA] Verify web dashboard update/restart action buttons
-
-State: closed
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/961
-
-### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-### Commits
- `fc21c1420` — add buttons to update Hermes and restart gateway
-
-### Files touched
- `web/src/pages/StatusPage.tsx`
- `web/src/lib/api.ts`
- `web/src/i18n/en.ts`
-
-### Tasks
- [ ] Open the Web UI status page and verify both buttons are present.
- [ ] Click Restart Gateway in a safe environment and verify running/output/success-or-failure states render.
- [ ] Click Update Hermes and verify the same action lifecycle.
- [ ] Verify the page remains responsive while actions are running.
-
-### Acceptance criteria
- [ ] Both action buttons are present and wired.
- [ ] Action status polling and result rendering work end-to-end.
- [ ] Manual evidence is attached in the issue.
-
-## #962 — [QA] Verify hardcoded-home path guard on burn/921 branch
-
-State: closed
-URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/962
-
-### Branch / checkout
- Validate specifically on `burn/921-poka-yoke-hardcoded-paths` (not upstream/main).
-
-### Commits
- `5dcb90531` — Poka-yoke: prevent hardcoded home-directory paths
-
-### Targeted tests
- `tests/test_path_guard.py`
-
-### Tasks
- [ ] Verify hardcoded `/Users/...` paths are rejected.
- [ ] Verify hardcoded `~/.hermes/...` paths are rejected in guarded contexts.
- [ ] Verify valid relative paths still pass.
- [ ] Verify appropriate absolute paths still pass where intended.
- [ ] Verify linting catches violations in non-test files.
-
-### Acceptance criteria
- [ ] Guard blocks the dangerous patterns and preserves allowed ones.
- [ ] Targeted tests pass.
--- a/scripts/morning_review_packet.py
+++ b/scripts/morning_review_packet.py
@@ -1,301 +0,0 @@
-#!/usr/bin/env python3
-"""Build a morning review packet from a Gitea epic and its child QA issues.
-
-This script fetches a parent epic plus its sub-issues, extracts the structured
-sections from each QA issue body, and renders a single markdown packet suitable
-for morning review.
-
-Usage:
-    python scripts/morning_review_packet.py --epic-number 949
-    python scripts/morning_review_packet.py --epic-number 949 --children 950-962
-    python scripts/morning_review_packet.py --epic-number 949 --output docs/review_packets/hermes-harness-2026-04-21.md
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import re
-import urllib.request
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Iterable
-
-DEFAULT_BASE_URL = "https://forge.alexanderwhitestone.com"
-DEFAULT_OWNER = "Timmy_Foundation"
-DEFAULT_REPO = "hermes-agent"
-DEFAULT_TOKEN_PATH = Path.home() / ".config" / "gitea" / "token"
-
-
-@dataclass(frozen=True)
-class CommitEvidence:
-    sha: str
-    summary: str
-
-
-@dataclass
-class ReviewIssue:
-    number: int
-    title: str
-    state: str
-    url: str
-    comments: int = 0
-    parent_issue: int | None = None
-    checkout_notes: list[str] = field(default_factory=list)
-    commits: list[CommitEvidence] = field(default_factory=list)
-    targeted_tests: list[str] = field(default_factory=list)
-    files_touched: list[str] = field(default_factory=list)
-    tasks: list[str] = field(default_factory=list)
-    acceptance_criteria: list[str] = field(default_factory=list)
-
-
-def parse_issue_number_spec(spec: str) -> list[int]:
-    """Parse a comma-separated issue list like ``950-952,955,962``."""
-    numbers: list[int] = []
-    seen: set[int] = set()
-    for chunk in (part.strip() for part in spec.split(",")):
-        if not chunk:
-            continue
-        if "-" in chunk:
-            start_str, end_str = (part.strip() for part in chunk.split("-", 1))
-            start = int(start_str)
-            end = int(end_str)
-            if end < start:
-                raise ValueError(f"Invalid descending issue range: {chunk}")
-            for number in range(start, end + 1):
-                if number not in seen:
-                    numbers.append(number)
-                    seen.add(number)
-        else:
-            number = int(chunk)
-            if number not in seen:
-                numbers.append(number)
-                seen.add(number)
-    return numbers
-
-
-def _parse_sections(body: str) -> dict[str, list[str]]:
-    sections: dict[str, list[str]] = {}
-    current: str | None = None
-    for raw_line in body.splitlines():
-        line = raw_line.rstrip()
-        if line.startswith("## "):
-            current = line[3:].strip()
-            sections[current] = []
-            continue
-        if current is not None:
-            sections[current].append(line)
-    return sections
-
-
-def _clean_bullet(line: str) -> str | None:
-    stripped = line.strip()
-    if not stripped:
-        return None
-    stripped = re.sub(r"^-\s*\[(?: |x|X)\]\s*", "", stripped)
-    stripped = re.sub(r"^-\s*", "", stripped)
-    return stripped.strip() or None
-
-
-def _extract_bullets(lines: Iterable[str]) -> list[str]:
-    items: list[str] = []
-    for line in lines:
-        cleaned = _clean_bullet(line)
-        if cleaned:
-            items.append(cleaned)
-    return items
-
-
-def _extract_parent_issue(body: str, sections: dict[str, list[str]]) -> int | None:
-    parent_lines = sections.get("Parent", [])
-    for line in parent_lines:
-        match = re.search(r"#(\d+)", line)
-        if match:
-            return int(match.group(1))
-    match = re.search(r"Linked to Epic\s+#(\d+)", body, flags=re.IGNORECASE)
-    if match:
-        return int(match.group(1))
-    return None
-
-
-def _extract_commits(lines: Iterable[str]) -> list[CommitEvidence]:
-    commits: list[CommitEvidence] = []
-    for item in _extract_bullets(lines):
-        match = re.match(r"`([^`]+)`\s*(.*)", item)
-        if match:
-            commits.append(CommitEvidence(sha=match.group(1).strip(), summary=match.group(2).strip()))
-        else:
-            commits.append(CommitEvidence(sha="", summary=item))
-    return commits
-
-
-def _strip_backticks(items: Iterable[str]) -> list[str]:
-    cleaned: list[str] = []
-    for item in items:
-        cleaned.append(item.replace("`", "").strip())
-    return cleaned
-
-
-def discover_child_issue_numbers(epic_body: str) -> list[int]:
-    """Discover sub-issue numbers from an epic body."""
-    sections = _parse_sections(epic_body)
-    sub_lines = sections.get("Sub-issues")
-    if not sub_lines:
-        return []
-    numbers: list[int] = []
-    seen: set[int] = set()
-    for line in sub_lines:
-        for match in re.finditer(r"#(\d+)", line):
-            number = int(match.group(1))
-            if number not in seen:
-                numbers.append(number)
-                seen.add(number)
-    return numbers
-
-
-def parse_child_issue(issue: dict) -> ReviewIssue:
-    body = issue.get("body") or ""
-    sections = _parse_sections(body)
-    commit_lines = sections.get("Commits landed today", []) or sections.get("Commit landed today", [])
-
-    return ReviewIssue(
-        number=int(issue["number"]),
-        title=issue.get("title") or "",
-        state=(issue.get("state") or "unknown").lower(),
-        url=issue.get("html_url") or issue.get("url") or "",
-        comments=int(issue.get("comments") or 0),
-        parent_issue=_extract_parent_issue(body, sections),
-        checkout_notes=_extract_bullets(sections.get("Branch / checkout", [])),
-        commits=_extract_commits(commit_lines),
-        targeted_tests=_strip_backticks(_extract_bullets(sections.get("Targeted tests", []))),
-        files_touched=_strip_backticks(_extract_bullets(sections.get("Files touched", []))),
-        tasks=_extract_bullets(sections.get("Tasks", [])),
-        acceptance_criteria=_extract_bullets(sections.get("Acceptance Criteria", [])),
-    )
-
-
-def build_packet_markdown(epic_issue: dict, child_issues: list[ReviewIssue]) -> str:
-    title = epic_issue.get("title") or f"Epic #{epic_issue.get('number')}"
-    url = epic_issue.get("html_url") or epic_issue.get("url") or ""
-    body = epic_issue.get("body") or ""
-    children = sorted(child_issues, key=lambda item: item.number)
-
-    lines: list[str] = []
-    lines.append("# Morning Review Packet")
-    lines.append("")
-    lines.append(f"Source epic: [{title}]({url})")
-    lines.append("")
-    lines.append("## Epic context")
-    lines.append("")
-    lines.append(title)
-    lines.append("")
-    for line in body.splitlines():
-        if line.strip():
-            lines.append(line)
-        else:
-            lines.append("")
-    lines.append("")
-    lines.append("## Summary")
-    lines.append("")
-    lines.append("| Issue | State | Commits | Tests |")
-    lines.append("| --- | --- | --- | --- |")
-    for child in children:
-        lines.append(
-            f"| #{child.number} | {child.state} | {len(child.commits)} | {len(child.targeted_tests)} |"
-        )
-    lines.append("")
-
-    for child in children:
-        lines.append(f"## #{child.number} — {child.title}")
-        lines.append("")
-        lines.append(f"State: {child.state}")
-        lines.append(f"URL: {child.url}")
-        lines.append("")
-        if child.checkout_notes:
-            lines.append("### Branch / checkout")
-            for note in child.checkout_notes:
-                lines.append(f"- {note}")
-            lines.append("")
-        if child.commits:
-            lines.append("### Commits")
-            for commit in child.commits:
-                if commit.sha:
-                    lines.append(f"- `{commit.sha}` — {commit.summary}")
-                else:
-                    lines.append(f"- {commit.summary}")
-            lines.append("")
-        if child.targeted_tests:
-            lines.append("### Targeted tests")
-            for test_path in child.targeted_tests:
-                lines.append(f"- `{test_path}`")
-            lines.append("")
-        if child.files_touched:
-            lines.append("### Files touched")
-            for file_path in child.files_touched:
-                lines.append(f"- `{file_path}`")
-            lines.append("")
-        if child.tasks:
-            lines.append("### Tasks")
-            for task in child.tasks:
-                lines.append(f"- [ ] {task}")
-            lines.append("")
-        if child.acceptance_criteria:
-            lines.append("### Acceptance criteria")
-            for item in child.acceptance_criteria:
-                lines.append(f"- [ ] {item}")
-            lines.append("")
-
-    return "\n".join(lines).rstrip() + "\n"
-
-
-def _resolve_token(explicit_token: str | None = None) -> str:
-    if explicit_token:
-        return explicit_token.strip()
-    env_token = os.getenv("GITEA_TOKEN")
-    if env_token:
-        return env_token.strip()
-    if DEFAULT_TOKEN_PATH.exists():
-        return DEFAULT_TOKEN_PATH.read_text().strip()
-    raise FileNotFoundError(f"No Gitea token found. Set GITEA_TOKEN or create {DEFAULT_TOKEN_PATH}")
-
-
-def fetch_issue(base_url: str, owner: str, repo: str, number: int, token: str) -> dict:
-    url = f"{base_url.rstrip('/')}/api/v1/repos/{owner}/{repo}/issues/{number}"
-    request = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
-    with urllib.request.urlopen(request, timeout=30) as response:
-        return json.loads(response.read().decode())
-
-
-def collect_child_issues(base_url: str, owner: str, repo: str, epic_issue: dict, token: str, children_spec: str | None = None) -> list[dict]:
-    numbers = parse_issue_number_spec(children_spec) if children_spec else discover_child_issue_numbers(epic_issue.get("body") or "")
-    return [fetch_issue(base_url, owner, repo, number, token) for number in numbers]
-
-
-def main(argv: list[str] | None = None) -> int:
-    parser = argparse.ArgumentParser(description="Build a markdown morning review packet from a Gitea epic")
-    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
-    parser.add_argument("--owner", default=DEFAULT_OWNER)
-    parser.add_argument("--repo", default=DEFAULT_REPO)
-    parser.add_argument("--epic-number", type=int, required=True)
-    parser.add_argument("--children", help="Explicit issue list/ranges, e.g. 950-962")
-    parser.add_argument("--token", help="Gitea token (defaults to GITEA_TOKEN or ~/.config/gitea/token)")
-    parser.add_argument("--output", help="Write markdown packet to this path instead of stdout")
-    args = parser.parse_args(argv)
-
-    token = _resolve_token(args.token)
-    epic_issue = fetch_issue(args.base_url, args.owner, args.repo, args.epic_number, token)
-    child_issue_dicts = collect_child_issues(args.base_url, args.owner, args.repo, epic_issue, token, args.children)
-    packet = build_packet_markdown(epic_issue, [parse_child_issue(issue) for issue in child_issue_dicts])
-
-    if args.output:
-        output_path = Path(args.output)
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        output_path.write_text(packet)
-    else:
-        print(packet, end="")
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
--- a/tests/test_morning_review_packet.py
+++ b/tests/test_morning_review_packet.py
@@ -1,162 +0,0 @@
-from pathlib import Path
-import sys
-
-SCRIPT_DIR = Path(__file__).resolve().parents[1] / "scripts"
-sys.path.insert(0, str(SCRIPT_DIR))
-
-import morning_review_packet as mrp
-
-
-EPIC_BODY = """Source: git log on upstream/main since 2026-04-21 00:00 EDT.
-
-## Success criteria
- [ ] Every issue has a clear PASS / FAIL outcome.
-
-## Sub-issues
- [ ] #950 [QA] Verify AI Gateway provider UX + attribution headers
- [ ] #951 [QA] Verify transport abstraction + AnthropicTransport wiring
- [x] #962 [QA] Verify hardcoded-home path guard on burn/921 branch
-"""
-
-
-CHILD_BODY_PLURAL = """## Parent
-#949
-
-## Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-## Commits landed today
- `b11753879` attribution default_headers for ai-gateway provider
- `700437440` curated picker with live pricing
-
-## Targeted tests
- `tests/hermes_cli/test_ai_gateway_models.py`
- `tests/run_agent/test_provider_attribution_headers.py`
-
-## Tasks
- [ ] Verify the picker ordering.
- [ ] Verify attribution headers.
-
-## Acceptance Criteria
- [ ] Picker shows AI Gateway prominently.
- [ ] Headers appear on OpenRouter calls.
-"""
-
-
-CHILD_BODY_SINGULAR = """## Parent
-#949
-
-## Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
-
-## Commit landed today
- `fc21c1420` add buttons to update Hermes and restart gateway
-
-## Files touched
- `web/src/pages/StatusPage.tsx`
- `web/src/lib/api.ts`
- `web/src/i18n/en.ts`
-
-## Tasks
- [ ] Open the Web UI status page and verify both buttons are present.
- [ ] Click Restart Gateway in a safe environment.
-"""
-
-
-def test_discover_child_issue_numbers_from_epic_body():
-    assert mrp.discover_child_issue_numbers(EPIC_BODY) == [950, 951, 962]
-
-
-def test_parse_issue_number_spec_supports_ranges_and_lists():
-    assert mrp.parse_issue_number_spec("950-952,955,962") == [950, 951, 952, 955, 962]
-
-
-def test_parse_child_issue_extracts_structured_sections():
-    issue = {
-        "number": 950,
-        "title": "[QA] Verify AI Gateway provider UX + attribution headers",
-        "state": "open",
-        "html_url": "https://forge.example/950",
-        "comments": 0,
-        "body": CHILD_BODY_PLURAL,
-    }
-
-    parsed = mrp.parse_child_issue(issue)
-
-    assert parsed.number == 950
-    assert parsed.parent_issue == 949
-    assert parsed.checkout_notes == ["Validate on `upstream/main` or an equivalent synced checkout."]
-    assert [c.sha for c in parsed.commits] == ["b11753879", "700437440"]
-    assert parsed.targeted_tests == [
-        "tests/hermes_cli/test_ai_gateway_models.py",
-        "tests/run_agent/test_provider_attribution_headers.py",
-    ]
-    assert parsed.tasks == [
-        "Verify the picker ordering.",
-        "Verify attribution headers.",
-    ]
-    assert parsed.acceptance_criteria == [
-        "Picker shows AI Gateway prominently.",
-        "Headers appear on OpenRouter calls.",
-    ]
-
-
-def test_parse_child_issue_handles_singular_commit_heading_and_files_touched():
-    issue = {
-        "number": 961,
-        "title": "[QA] Verify web dashboard update/restart action buttons",
-        "state": "closed",
-        "html_url": "https://forge.example/961",
-        "comments": 16,
-        "body": CHILD_BODY_SINGULAR,
-    }
-
-    parsed = mrp.parse_child_issue(issue)
-
-    assert [c.sha for c in parsed.commits] == ["fc21c1420"]
-    assert parsed.files_touched == [
-        "web/src/pages/StatusPage.tsx",
-        "web/src/lib/api.ts",
-        "web/src/i18n/en.ts",
-    ]
-    assert parsed.tasks == [
-        "Open the Web UI status page and verify both buttons are present.",
-        "Click Restart Gateway in a safe environment.",
-    ]
-
-
-def test_build_packet_markdown_renders_summary_and_details():
-    epic_issue = {
-        "number": 949,
-        "title": "EPIC: Morning review packet — Hermes harness features landed 2026-04-21",
-        "state": "open",
-        "html_url": "https://forge.example/949",
-        "body": EPIC_BODY,
-    }
-    child_a = mrp.parse_child_issue({
-        "number": 950,
-        "title": "[QA] Verify AI Gateway provider UX + attribution headers",
-        "state": "open",
-        "html_url": "https://forge.example/950",
-        "comments": 0,
-        "body": CHILD_BODY_PLURAL,
-    })
-    child_b = mrp.parse_child_issue({
-        "number": 961,
-        "title": "[QA] Verify web dashboard update/restart action buttons",
-        "state": "closed",
-        "html_url": "https://forge.example/961",
-        "comments": 16,
-        "body": CHILD_BODY_SINGULAR,
-    })
-
-    markdown = mrp.build_packet_markdown(epic_issue, [child_a, child_b])
-
-    assert "# Morning Review Packet" in markdown
-    assert "EPIC: Morning review packet — Hermes harness features landed 2026-04-21" in markdown
-    assert "| #950 | open | 2 | 2 |" in markdown
-    assert "| #961 | closed | 1 | 0 |" in markdown
-    assert "## #950 — [QA] Verify AI Gateway provider UX + attribution headers" in markdown
-    assert "## #961 — [QA] Verify web dashboard update/restart action buttons" in markdown
-    assert "`b11753879` — attribution default_headers for ai-gateway provider" in markdown
-    assert "`web/src/pages/StatusPage.tsx`" in markdown
--- a/tests/test_vision_benchmark.py
+++ b/tests/test_vision_benchmark.py
@@ -11,12 +11,14 @@ import pytest
 sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))

 from vision_benchmark import (
+    analyze_with_model,
    compute_ocr_accuracy,
    compute_description_completeness,
    compute_structural_accuracy,
    aggregate_results,
    to_markdown,
    generate_sample_dataset,
+    load_dataset,
    MODELS,
    EVAL_PROMPTS,
 )
@@ -197,6 +199,71 @@ class TestMarkdown:


 class TestDataset:
+    def test_repo_dataset_uses_local_image_paths(self):
+        dataset_path = Path(__file__).parent.parent / "benchmarks" / "test_images.json"
+        dataset = json.loads(dataset_path.read_text())
+
+        assert dataset, "benchmark dataset should not be empty"
+        assert all(not entry["url"].startswith(("http://", "https://")) for entry in dataset)
+
+    def test_load_dataset_resolves_relative_local_paths(self, tmp_path):
+        images_dir = tmp_path / "images"
+        images_dir.mkdir()
+        image_path = images_dir / "sample.png"
+        image_path.write_bytes(b"png-bytes")
+
+        dataset_path = tmp_path / "dataset.json"
+        dataset_path.write_text(json.dumps([
+            {
+                "id": "sample",
+                "url": "images/sample.png",
+                "category": "photo",
+                "expected_keywords": [],
+                "expected_structure": {"min_length": 30, "min_sentences": 1},
+            }
+        ]))
+
+        loaded = load_dataset(str(dataset_path))
+
+        assert loaded[0]["url"] == str(image_path.resolve())
+
+    @pytest.mark.asyncio
+    async def test_analyze_with_model_encodes_local_file_as_data_url(self, tmp_path, monkeypatch):
+        image_path = tmp_path / "tiny.png"
+        image_path.write_bytes(
+            bytes.fromhex(
+                "89504E470D0A1A0A"
+                "0000000D49484452000000010000000108060000001F15C489"
+                "0000000D49444154789C6360000002000154A24F5D00000000"
+                "49454E44AE426082"
+            )
+        )
+
+        fake_response = MagicMock()
+        fake_response.raise_for_status.return_value = None
+        fake_response.json.return_value = {
+            "choices": [{"message": {"content": "Looks like a tiny image."}}],
+            "usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3},
+        }
+
+        fake_client = MagicMock()
+        fake_client.post = AsyncMock(return_value=fake_response)
+        fake_ctx = MagicMock()
+        fake_ctx.__aenter__ = AsyncMock(return_value=fake_client)
+        fake_ctx.__aexit__ = AsyncMock(return_value=None)
+
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test-key")
+        with patch("httpx.AsyncClient", return_value=fake_ctx):
+            result = await analyze_with_model(
+                str(image_path),
+                "Describe this image",
+                {"provider": "openrouter", "model_id": "fake/model"},
+            )
+
+        assert result["success"] is True
+        sent_url = fake_client.post.await_args.kwargs["json"]["messages"][0]["content"][1]["image_url"]["url"]
+        assert sent_url.startswith("data:image/png;base64,")
+
    def test_sample_dataset_has_entries(self):
        dataset = generate_sample_dataset()
        assert len(dataset) >= 4