Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
0a814f5bef fix: vendor vision benchmark fixtures (#868)
All checks were successful
Lint / lint (pull_request) Successful in 11s
2026-04-22 11:37:04 -04:00
30 changed files with 332 additions and 919 deletions

View File

@@ -1,194 +1,354 @@
[
{
"id": "screenshot_github_home",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"url": "test_images/screenshot_github_home.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "mark"],
"expected_keywords": [
"github",
"logo",
"mark"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_mermaid_flow",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
"url": "test_images/diagram_mermaid_flow.png",
"category": "diagram",
"expected_keywords": ["flow", "diagram", "process"],
"expected_keywords": [
"flow",
"diagram",
"process"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_1",
"url": "https://picsum.photos/seed/vision1/400/300",
"url": "test_images/photo_random_1.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_2",
"url": "https://picsum.photos/seed/vision2/400/300",
"url": "test_images/photo_random_2.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_simple_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"url": "test_images/chart_simple_bar.png",
"category": "chart",
"expected_keywords": ["bar", "chart", "revenue"],
"expected_keywords": [
"bar",
"chart",
"revenue"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_pie",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"url": "test_images/chart_pie.png",
"category": "chart",
"expected_keywords": ["pie", "chart", "percentage"],
"expected_keywords": [
"pie",
"chart",
"percentage"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "diagram_org_chart",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_org_chart.png",
"category": "diagram",
"expected_keywords": ["organization", "hierarchy", "chart"],
"expected_keywords": [
"organization",
"hierarchy",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "screenshot_terminal",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"url": "test_images/screenshot_terminal.png",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"expected_keywords": [
"terminal",
"command",
"output"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"url": "test_images/photo_random_3.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_line",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"url": "test_images/chart_line.png",
"category": "chart",
"expected_keywords": ["line", "chart", "temperature"],
"expected_keywords": [
"line",
"chart",
"temperature"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "diagram_sequence",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_sequence.png",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"expected_keywords": [
"sequence",
"interaction",
"message"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"url": "test_images/photo_random_4.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"url": "test_images/screenshot_webpage.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"expected_keywords": [
"github",
"page",
"web"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_radar",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"url": "test_images/chart_radar.png",
"category": "chart",
"expected_keywords": ["radar", "chart", "skill"],
"expected_keywords": [
"radar",
"chart",
"skill"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_5",
"url": "https://picsum.photos/seed/vision5/400/300",
"url": "test_images/photo_random_5.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_class.png",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"expected_keywords": [
"class",
"object",
"attribute"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"url": "test_images/chart_doughnut.png",
"category": "chart",
"expected_keywords": ["doughnut", "chart", "device"],
"expected_keywords": [
"doughnut",
"chart",
"device"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"url": "test_images/photo_random_6.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"url": "test_images/screenshot_error.png",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"expected_keywords": [
"404",
"error",
"cat"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"url": "test_images/diagram_network.png",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"expected_keywords": [
"network",
"node",
"connection"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"url": "test_images/photo_random_7.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_stacked_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"url": "test_images/chart_stacked_bar.png",
"category": "chart",
"expected_keywords": ["stacked", "bar", "chart"],
"expected_keywords": [
"stacked",
"bar",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "screenshot_dashboard",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"url": "test_images/screenshot_dashboard.png",
"category": "screenshot",
"expected_keywords": ["search", "code", "feature"],
"expected_keywords": [
"search",
"code",
"feature"
],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_8",
"url": "https://picsum.photos/seed/vision8/400/300",
"url": "test_images/photo_random_8.png",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
}
]

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.2 KiB

View File

@@ -11,17 +11,19 @@ Usage:
# Single image test
python benchmarks/vision_benchmark.py --url https://example.com/image.png
python benchmarks/vision_benchmark.py --url benchmarks/test_images/photo_random_1.png
# Generate test report
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json
Test image dataset: benchmarks/test_images.json (50-100 diverse images)
Test image dataset: benchmarks/test_images.json (committed local fixtures under benchmarks/test_images/)
"""
import argparse
import asyncio
import base64
import json
import mimetypes
import os
import statistics
import sys
@@ -67,6 +69,28 @@ EVAL_PROMPTS = {
# ---------------------------------------------------------------------------
def _is_remote_image_source(image_source: str) -> bool:
return image_source.startswith(("http://", "https://", "data:", "file://"))
def _image_source_to_payload_url(image_source: str) -> str:
"""Convert local image paths into data URLs; keep remote URLs unchanged."""
if image_source.startswith(("http://", "https://", "data:")):
return image_source
resolved = image_source[len("file://"):] if image_source.startswith("file://") else image_source
local_path = Path(os.path.expanduser(resolved)).resolve()
if not local_path.is_file():
return image_source
mime_type, _ = mimetypes.guess_type(str(local_path))
if not mime_type:
mime_type = "application/octet-stream"
encoded = base64.b64encode(local_path.read_bytes()).decode("ascii")
return f"data:{mime_type};base64,{encoded}"
async def analyze_with_model(
image_url: str,
prompt: str,
@@ -84,6 +108,8 @@ async def analyze_with_model(
"""
import httpx
image_payload_url = _image_source_to_payload_url(image_url)
provider = model_config["provider"]
model_id = model_config["model_id"]
@@ -93,7 +119,7 @@ async def analyze_with_model(
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "image_url", "image_url": {"url": image_payload_url}},
],
}
]
@@ -570,8 +596,18 @@ def generate_sample_dataset() -> List[dict]:
def load_dataset(path: str) -> List[dict]:
"""Load test dataset from JSON file."""
with open(path) as f:
return json.load(f)
dataset_path = Path(path).resolve()
with open(dataset_path) as f:
dataset = json.load(f)
base_dir = dataset_path.parent
for image in dataset:
image_url = image.get("url")
if not image_url or _is_remote_image_source(image_url):
continue
image["url"] = str((base_dir / image_url).resolve())
return dataset
# ---------------------------------------------------------------------------
@@ -582,7 +618,7 @@ def load_dataset(path: str) -> List[dict]:
async def main():
parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
parser.add_argument("--images", help="Path to test images JSON file")
parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--url", help="Single image URL or local file path to test")
parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")

View File

@@ -1,387 +0,0 @@
# Morning Review Packet
Source epic: [EPIC: Morning review packet — Hermes harness features landed 2026-04-21](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/949)
## Epic context
EPIC: Morning review packet — Hermes harness features landed 2026-04-21
Source: git log on upstream/main since 2026-04-21 00:00 EDT, plus the current local branch `burn/921-poka-yoke-hardcoded-paths` for the branch-only path-guard work.
Important review note:
- Validate upstream-landed features on `upstream/main` or a synced branch.
- Validate the path-guard work on `burn/921-poka-yoke-hardcoded-paths`.
This epic is a morning-review packet: one QA issue per feature cluster, each with concrete acceptance criteria and targeted tests or manual checks.
## Success criteria
- [ ] Every issue has a clear PASS / FAIL outcome.
- [ ] Test output or manual evidence is attached to each issue.
- [ ] Any drift between upstream/main and forge/main is called out explicitly.
## Sub-issues
### Upstream/main features landed 2026-04-21
- [ ] #950 [QA] Verify AI Gateway provider UX + attribution headers
- [ ] #951 [QA] Verify transport abstraction + AnthropicTransport wiring
- [ ] #952 [QA] Verify CLI voice beep toggle
- [ ] #953 [QA] Verify bundled skill scripts run out of the box
- [ ] #954 [QA] Verify maps skill guest_house / camp_site / bakery expansion
- [ ] #955 [QA] Verify KittenTTS local provider end-to-end
- [ ] #956 [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
- [ ] #957 [QA] Verify optional adversarial-ux-test skill catalog flow
- [ ] #958 [QA] Verify /usage account limits in CLI + gateway
- [ ] #959 [QA] Verify OpenCode-Go curated catalog additions
- [ ] #960 [QA] Verify patch 'did you mean?' suggestions
- [ ] #961 [QA] Verify web dashboard update/restart action buttons
### Local branch-only work
- [ ] #962 [QA] Verify hardcoded-home path guard on burn/921 branch
## Summary
| Issue | State | Commits | Tests |
| --- | --- | --- | --- |
| #950 | open | 5 | 2 |
| #951 | open | 2 | 2 |
| #952 | open | 1 | 1 |
| #953 | open | 1 | 2 |
| #954 | open | 1 | 0 |
| #955 | open | 2 | 1 |
| #956 | open | 1 | 0 |
| #957 | open | 1 | 0 |
| #958 | open | 2 | 2 |
| #959 | open | 1 | 1 |
| #960 | open | 2 | 1 |
| #961 | closed | 1 | 0 |
| #962 | closed | 1 | 1 |
## #950 — [QA] Verify AI Gateway provider UX + attribution headers
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/950
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `b11753879` — attribution default_headers for ai-gateway provider
- `700437440` — curated picker with live pricing
- `ac26a460f` — promote ai-gateway in provider picker ordering
- `5bb2d11b0` — auto-promote free Moonshot models
- `29f57ec95` — Vercel deep-link for API key creation
### Targeted tests
- `tests/hermes_cli/test_ai_gateway_models.py`
- `tests/run_agent/test_provider_attribution_headers.py`
### Tasks
- [ ] Open `hermes model` and verify `ai-gateway` appears near the top.
- [ ] Verify live pricing appears in the picker.
- [ ] Verify free Moonshot models are promoted.
- [ ] Trigger API-key setup flow and verify the Vercel deep link.
- [ ] Send one ai-gateway request and verify attribution headers are attached.
### Acceptance criteria
- [ ] UI ordering and pricing match the landed behavior.
- [ ] Attribution headers are present on ai-gateway requests.
- [ ] Targeted tests pass.
## #951 — [QA] Verify transport abstraction + AnthropicTransport wiring
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/951
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `7ab5eebd0` — transport types + Anthropic normalize migration
- `731f4fbae` — transport ABC + AnthropicTransport wired to all paths
### Targeted tests
- `tests/agent/transports/test_types.py`
- `tests/agent/test_anthropic_normalize_v2.py`
### Tasks
- [ ] Verify plain-text Anthropic responses normalize correctly.
- [ ] Verify tool-call responses preserve IDs, names, and arguments.
- [ ] Verify reasoning/thinking is preserved separately from visible content.
- [ ] Verify finish_reason mapping remains correct across paths.
### Acceptance criteria
- [ ] Normalized response shape is stable.
- [ ] Tool-call and reasoning payloads survive normalization.
- [ ] Targeted tests pass.
## #952 — [QA] Verify CLI voice beep toggle
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/952
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `b48ea41d2` — voice: add CLI beep toggle
### Targeted tests
- `tests/tools/test_voice_cli_integration.py`
### Tasks
- [ ] Enable the beep option in config and confirm voice mode emits the beep.
- [ ] Disable the option and confirm the same path is silent.
- [ ] Verify voice mode still strips markdown before speech output.
- [ ] Verify voice mode does not pollute conversation history with TTS-only text.
### Acceptance criteria
- [ ] Beep behavior is actually toggled by config.
- [ ] Existing voice/TTS integration behavior is not regressed.
- [ ] Targeted tests pass.
## #953 — [QA] Verify bundled skill scripts run out of the box
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/953
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `328223576` — make bundled skill scripts runnable out of the box
### Targeted tests
- `tests/agent/test_skill_commands.py`
- `tests/tools/test_local_shell_init.py`
### Tasks
- [ ] Pick a bundled skill that ships a script and run it without manual chmod/PATH surgery.
- [ ] Verify local terminal execution resolves the installed skill script correctly.
- [ ] Verify local shell init still behaves correctly.
### Acceptance criteria
- [ ] Bundled skill scripts execute from the installed skill location with no manual prep.
- [ ] Local shell init remains healthy.
- [ ] Targeted tests pass.
## #954 — [QA] Verify maps skill guest_house / camp_site / bakery expansion
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/954
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `c5a814b23` — maps: add guest_house, camp_site, and dual-key bakery lookup
### Tasks
- [ ] Use the maps skill to search for a guest house in a known populated area.
- [ ] Use the maps skill to search for a camp site in a known populated area.
- [ ] Use the maps skill to search for a bakery and verify both supported keys resolve correctly.
- [ ] Confirm results are sensible and non-empty.
### Acceptance criteria
- [ ] All three place types resolve correctly.
- [ ] Bakery lookup works through both supported keys.
- [ ] Manual evidence is attached in the issue.
## #955 — [QA] Verify KittenTTS local provider end-to-end
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/955
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `1830ebfc5` — add KittenTTS provider
- `2d7ff9c5b` — complete KittenTTS integration across tools/setup/docs/tests
### Targeted tests
- `tests/tools/test_tts_kittentts.py`
### Tasks
- [ ] Configure TTS to use `kittentts`.
- [ ] Generate speech to `.wav` and verify playable output.
- [ ] Verify voice / speed / cleaned text are passed correctly.
- [ ] Generate repeated requests and verify model caching behavior.
- [ ] Generate a non-wav output and verify ffmpeg conversion path.
- [ ] Verify missing-package behavior returns a helpful error.
### Acceptance criteria
- [ ] KittenTTS works end-to-end when installed.
- [ ] Failure mode is operator-friendly when not installed.
- [ ] Targeted tests pass.
## #956 — [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/956
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `d1ed6f4fb` — CLI: add numbered keyboard shortcuts to approval and clarify prompts
### Tasks
- [ ] Trigger an approval prompt and choose an option with number keys.
- [ ] Trigger a clarify prompt and choose an option with number keys.
- [ ] Verify the correct option is submitted both times.
- [ ] Verify normal keyboard navigation still works.
### Acceptance criteria
- [ ] Number-key selection works for both prompt types.
- [ ] Legacy keyboard navigation is not broken.
- [ ] Manual evidence is attached in the issue.
## #957 — [QA] Verify optional adversarial-ux-test skill catalog flow
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/957
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `e50e7f11b` — skills: add adversarial-ux-test optional skill
### Tasks
- [ ] Verify the optional skill appears in the optional skill catalog.
- [ ] Install or enable the skill.
- [ ] Load it successfully through Hermes.
- [ ] Disable or remove it and verify catalog state updates cleanly.
### Acceptance criteria
- [ ] Catalog listing is correct.
- [ ] Install / load / disable lifecycle works cleanly.
- [ ] Manual evidence is attached in the issue.
## #958 — [QA] Verify /usage account limits in CLI + gateway
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/958
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `8a11b0a20` — per-provider account limits module
- `bcc5d7b67` — append account limits section in CLI and gateway
### Targeted tests
- `tests/test_account_usage.py`
- `tests/gateway/test_usage_command.py`
### Tasks
- [ ] Run `/usage` in CLI for a provider with account limits.
- [ ] Verify provider, remaining quota, total limit, and reset window render correctly.
- [ ] Run `/usage` through the gateway and verify the same section appears.
- [ ] Verify zero-value cache read/write sections stay hidden when appropriate.
### Acceptance criteria
- [ ] CLI and gateway both show the landed account-limits section correctly.
- [ ] Targeted tests pass.
## #959 — [QA] Verify OpenCode-Go curated catalog additions
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/959
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `4fea1769d` — opencode-go: add Kimi K2.6 and Qwen3.5/3.6 Plus to curated catalog
### Targeted tests
- `tests/hermes_cli/test_opencode_go_in_model_list.py`
### Tasks
- [ ] With valid OpenCode-Go credentials, open `hermes model`.
- [ ] Verify Kimi K2.6 appears.
- [ ] Verify Qwen 3.5 Plus and 3.6 Plus appear.
- [ ] Unset credentials and verify the provider/catalog hides correctly.
### Acceptance criteria
- [ ] New curated models are present when credentials exist.
- [ ] Catalog visibility still respects credential gating.
- [ ] Targeted tests pass.
## #960 — [QA] Verify patch 'did you mean?' suggestions
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/960
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `15abf4ed8` — add `did you mean?` feedback when patch fails to match
- `5e6427a42` — gate it to true no-match cases and extend to v4a / skill_manage
### Targeted tests
- `tests/tools/test_fuzzy_match.py`
### Tasks
- [ ] Intentionally run a replace/patch with a near-miss `old_string`.
- [ ] Verify the tool suggests a useful nearby line/context.
- [ ] Verify suggestions only appear on true no-match failures.
- [ ] Verify the behavior also works via file tools, v4a patching, and skill_manage.
### Acceptance criteria
- [ ] Suggestion quality is helpful, not noisy.
- [ ] Suggestions are correctly gated to no-match cases.
- [ ] Targeted tests pass.
## #961 — [QA] Verify web dashboard update/restart action buttons
State: closed
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/961
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `fc21c1420` — add buttons to update Hermes and restart gateway
### Files touched
- `web/src/pages/StatusPage.tsx`
- `web/src/lib/api.ts`
- `web/src/i18n/en.ts`
### Tasks
- [ ] Open the Web UI status page and verify both buttons are present.
- [ ] Click Restart Gateway in a safe environment and verify running/output/success-or-failure states render.
- [ ] Click Update Hermes and verify the same action lifecycle.
- [ ] Verify the page remains responsive while actions are running.
### Acceptance criteria
- [ ] Both action buttons are present and wired.
- [ ] Action status polling and result rendering work end-to-end.
- [ ] Manual evidence is attached in the issue.
## #962 — [QA] Verify hardcoded-home path guard on burn/921 branch
State: closed
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/962
### Branch / checkout
- Validate specifically on `burn/921-poka-yoke-hardcoded-paths` (not upstream/main).
### Commits
- `5dcb90531` — Poka-yoke: prevent hardcoded home-directory paths
### Targeted tests
- `tests/test_path_guard.py`
### Tasks
- [ ] Verify hardcoded `/Users/...` paths are rejected.
- [ ] Verify hardcoded `~/.hermes/...` paths are rejected in guarded contexts.
- [ ] Verify valid relative paths still pass.
- [ ] Verify appropriate absolute paths still pass where intended.
- [ ] Verify linting catches violations in non-test files.
### Acceptance criteria
- [ ] Guard blocks the dangerous patterns and preserves allowed ones.
- [ ] Targeted tests pass.

View File

@@ -1,301 +0,0 @@
#!/usr/bin/env python3
"""Build a morning review packet from a Gitea epic and its child QA issues.
This script fetches a parent epic plus its sub-issues, extracts the structured
sections from each QA issue body, and renders a single markdown packet suitable
for morning review.
Usage:
python scripts/morning_review_packet.py --epic-number 949
python scripts/morning_review_packet.py --epic-number 949 --children 950-962
python scripts/morning_review_packet.py --epic-number 949 --output docs/review_packets/hermes-harness-2026-04-21.md
"""
from __future__ import annotations
import argparse
import json
import os
import re
import urllib.request
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable
DEFAULT_BASE_URL = "https://forge.alexanderwhitestone.com"
DEFAULT_OWNER = "Timmy_Foundation"
DEFAULT_REPO = "hermes-agent"
DEFAULT_TOKEN_PATH = Path.home() / ".config" / "gitea" / "token"
@dataclass(frozen=True)
class CommitEvidence:
sha: str
summary: str
@dataclass
class ReviewIssue:
number: int
title: str
state: str
url: str
comments: int = 0
parent_issue: int | None = None
checkout_notes: list[str] = field(default_factory=list)
commits: list[CommitEvidence] = field(default_factory=list)
targeted_tests: list[str] = field(default_factory=list)
files_touched: list[str] = field(default_factory=list)
tasks: list[str] = field(default_factory=list)
acceptance_criteria: list[str] = field(default_factory=list)
def parse_issue_number_spec(spec: str) -> list[int]:
"""Parse a comma-separated issue list like ``950-952,955,962``."""
numbers: list[int] = []
seen: set[int] = set()
for chunk in (part.strip() for part in spec.split(",")):
if not chunk:
continue
if "-" in chunk:
start_str, end_str = (part.strip() for part in chunk.split("-", 1))
start = int(start_str)
end = int(end_str)
if end < start:
raise ValueError(f"Invalid descending issue range: {chunk}")
for number in range(start, end + 1):
if number not in seen:
numbers.append(number)
seen.add(number)
else:
number = int(chunk)
if number not in seen:
numbers.append(number)
seen.add(number)
return numbers
def _parse_sections(body: str) -> dict[str, list[str]]:
sections: dict[str, list[str]] = {}
current: str | None = None
for raw_line in body.splitlines():
line = raw_line.rstrip()
if line.startswith("## "):
current = line[3:].strip()
sections[current] = []
continue
if current is not None:
sections[current].append(line)
return sections
def _clean_bullet(line: str) -> str | None:
stripped = line.strip()
if not stripped:
return None
stripped = re.sub(r"^-\s*\[(?: |x|X)\]\s*", "", stripped)
stripped = re.sub(r"^-\s*", "", stripped)
return stripped.strip() or None
def _extract_bullets(lines: Iterable[str]) -> list[str]:
items: list[str] = []
for line in lines:
cleaned = _clean_bullet(line)
if cleaned:
items.append(cleaned)
return items
def _extract_parent_issue(body: str, sections: dict[str, list[str]]) -> int | None:
parent_lines = sections.get("Parent", [])
for line in parent_lines:
match = re.search(r"#(\d+)", line)
if match:
return int(match.group(1))
match = re.search(r"Linked to Epic\s+#(\d+)", body, flags=re.IGNORECASE)
if match:
return int(match.group(1))
return None
def _extract_commits(lines: Iterable[str]) -> list[CommitEvidence]:
commits: list[CommitEvidence] = []
for item in _extract_bullets(lines):
match = re.match(r"`([^`]+)`\s*(.*)", item)
if match:
commits.append(CommitEvidence(sha=match.group(1).strip(), summary=match.group(2).strip()))
else:
commits.append(CommitEvidence(sha="", summary=item))
return commits
def _strip_backticks(items: Iterable[str]) -> list[str]:
cleaned: list[str] = []
for item in items:
cleaned.append(item.replace("`", "").strip())
return cleaned
def discover_child_issue_numbers(epic_body: str) -> list[int]:
"""Discover sub-issue numbers from an epic body."""
sections = _parse_sections(epic_body)
sub_lines = sections.get("Sub-issues")
if not sub_lines:
return []
numbers: list[int] = []
seen: set[int] = set()
for line in sub_lines:
for match in re.finditer(r"#(\d+)", line):
number = int(match.group(1))
if number not in seen:
numbers.append(number)
seen.add(number)
return numbers
def parse_child_issue(issue: dict) -> ReviewIssue:
body = issue.get("body") or ""
sections = _parse_sections(body)
commit_lines = sections.get("Commits landed today", []) or sections.get("Commit landed today", [])
return ReviewIssue(
number=int(issue["number"]),
title=issue.get("title") or "",
state=(issue.get("state") or "unknown").lower(),
url=issue.get("html_url") or issue.get("url") or "",
comments=int(issue.get("comments") or 0),
parent_issue=_extract_parent_issue(body, sections),
checkout_notes=_extract_bullets(sections.get("Branch / checkout", [])),
commits=_extract_commits(commit_lines),
targeted_tests=_strip_backticks(_extract_bullets(sections.get("Targeted tests", []))),
files_touched=_strip_backticks(_extract_bullets(sections.get("Files touched", []))),
tasks=_extract_bullets(sections.get("Tasks", [])),
acceptance_criteria=_extract_bullets(sections.get("Acceptance Criteria", [])),
)
def build_packet_markdown(epic_issue: dict, child_issues: list[ReviewIssue]) -> str:
title = epic_issue.get("title") or f"Epic #{epic_issue.get('number')}"
url = epic_issue.get("html_url") or epic_issue.get("url") or ""
body = epic_issue.get("body") or ""
children = sorted(child_issues, key=lambda item: item.number)
lines: list[str] = []
lines.append("# Morning Review Packet")
lines.append("")
lines.append(f"Source epic: [{title}]({url})")
lines.append("")
lines.append("## Epic context")
lines.append("")
lines.append(title)
lines.append("")
for line in body.splitlines():
if line.strip():
lines.append(line)
else:
lines.append("")
lines.append("")
lines.append("## Summary")
lines.append("")
lines.append("| Issue | State | Commits | Tests |")
lines.append("| --- | --- | --- | --- |")
for child in children:
lines.append(
f"| #{child.number} | {child.state} | {len(child.commits)} | {len(child.targeted_tests)} |"
)
lines.append("")
for child in children:
lines.append(f"## #{child.number}{child.title}")
lines.append("")
lines.append(f"State: {child.state}")
lines.append(f"URL: {child.url}")
lines.append("")
if child.checkout_notes:
lines.append("### Branch / checkout")
for note in child.checkout_notes:
lines.append(f"- {note}")
lines.append("")
if child.commits:
lines.append("### Commits")
for commit in child.commits:
if commit.sha:
lines.append(f"- `{commit.sha}` — {commit.summary}")
else:
lines.append(f"- {commit.summary}")
lines.append("")
if child.targeted_tests:
lines.append("### Targeted tests")
for test_path in child.targeted_tests:
lines.append(f"- `{test_path}`")
lines.append("")
if child.files_touched:
lines.append("### Files touched")
for file_path in child.files_touched:
lines.append(f"- `{file_path}`")
lines.append("")
if child.tasks:
lines.append("### Tasks")
for task in child.tasks:
lines.append(f"- [ ] {task}")
lines.append("")
if child.acceptance_criteria:
lines.append("### Acceptance criteria")
for item in child.acceptance_criteria:
lines.append(f"- [ ] {item}")
lines.append("")
return "\n".join(lines).rstrip() + "\n"
def _resolve_token(explicit_token: str | None = None) -> str:
if explicit_token:
return explicit_token.strip()
env_token = os.getenv("GITEA_TOKEN")
if env_token:
return env_token.strip()
if DEFAULT_TOKEN_PATH.exists():
return DEFAULT_TOKEN_PATH.read_text().strip()
raise FileNotFoundError(f"No Gitea token found. Set GITEA_TOKEN or create {DEFAULT_TOKEN_PATH}")
def fetch_issue(base_url: str, owner: str, repo: str, number: int, token: str) -> dict:
url = f"{base_url.rstrip('/')}/api/v1/repos/{owner}/{repo}/issues/{number}"
request = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
with urllib.request.urlopen(request, timeout=30) as response:
return json.loads(response.read().decode())
def collect_child_issues(base_url: str, owner: str, repo: str, epic_issue: dict, token: str, children_spec: str | None = None) -> list[dict]:
numbers = parse_issue_number_spec(children_spec) if children_spec else discover_child_issue_numbers(epic_issue.get("body") or "")
return [fetch_issue(base_url, owner, repo, number, token) for number in numbers]
def main(argv: list[str] | None = None) -> int:
parser = argparse.ArgumentParser(description="Build a markdown morning review packet from a Gitea epic")
parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
parser.add_argument("--owner", default=DEFAULT_OWNER)
parser.add_argument("--repo", default=DEFAULT_REPO)
parser.add_argument("--epic-number", type=int, required=True)
parser.add_argument("--children", help="Explicit issue list/ranges, e.g. 950-962")
parser.add_argument("--token", help="Gitea token (defaults to GITEA_TOKEN or ~/.config/gitea/token)")
parser.add_argument("--output", help="Write markdown packet to this path instead of stdout")
args = parser.parse_args(argv)
token = _resolve_token(args.token)
epic_issue = fetch_issue(args.base_url, args.owner, args.repo, args.epic_number, token)
child_issue_dicts = collect_child_issues(args.base_url, args.owner, args.repo, epic_issue, token, args.children)
packet = build_packet_markdown(epic_issue, [parse_child_issue(issue) for issue in child_issue_dicts])
if args.output:
output_path = Path(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(packet)
else:
print(packet, end="")
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -1,162 +0,0 @@
from pathlib import Path
import sys
SCRIPT_DIR = Path(__file__).resolve().parents[1] / "scripts"
sys.path.insert(0, str(SCRIPT_DIR))
import morning_review_packet as mrp
EPIC_BODY = """Source: git log on upstream/main since 2026-04-21 00:00 EDT.
## Success criteria
- [ ] Every issue has a clear PASS / FAIL outcome.
## Sub-issues
- [ ] #950 [QA] Verify AI Gateway provider UX + attribution headers
- [ ] #951 [QA] Verify transport abstraction + AnthropicTransport wiring
- [x] #962 [QA] Verify hardcoded-home path guard on burn/921 branch
"""
CHILD_BODY_PLURAL = """## Parent
#949
## Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
## Commits landed today
- `b11753879` attribution default_headers for ai-gateway provider
- `700437440` curated picker with live pricing
## Targeted tests
- `tests/hermes_cli/test_ai_gateway_models.py`
- `tests/run_agent/test_provider_attribution_headers.py`
## Tasks
- [ ] Verify the picker ordering.
- [ ] Verify attribution headers.
## Acceptance Criteria
- [ ] Picker shows AI Gateway prominently.
- [ ] Headers appear on OpenRouter calls.
"""
CHILD_BODY_SINGULAR = """## Parent
#949
## Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
## Commit landed today
- `fc21c1420` add buttons to update Hermes and restart gateway
## Files touched
- `web/src/pages/StatusPage.tsx`
- `web/src/lib/api.ts`
- `web/src/i18n/en.ts`
## Tasks
- [ ] Open the Web UI status page and verify both buttons are present.
- [ ] Click Restart Gateway in a safe environment.
"""
def test_discover_child_issue_numbers_from_epic_body():
assert mrp.discover_child_issue_numbers(EPIC_BODY) == [950, 951, 962]
def test_parse_issue_number_spec_supports_ranges_and_lists():
assert mrp.parse_issue_number_spec("950-952,955,962") == [950, 951, 952, 955, 962]
def test_parse_child_issue_extracts_structured_sections():
issue = {
"number": 950,
"title": "[QA] Verify AI Gateway provider UX + attribution headers",
"state": "open",
"html_url": "https://forge.example/950",
"comments": 0,
"body": CHILD_BODY_PLURAL,
}
parsed = mrp.parse_child_issue(issue)
assert parsed.number == 950
assert parsed.parent_issue == 949
assert parsed.checkout_notes == ["Validate on `upstream/main` or an equivalent synced checkout."]
assert [c.sha for c in parsed.commits] == ["b11753879", "700437440"]
assert parsed.targeted_tests == [
"tests/hermes_cli/test_ai_gateway_models.py",
"tests/run_agent/test_provider_attribution_headers.py",
]
assert parsed.tasks == [
"Verify the picker ordering.",
"Verify attribution headers.",
]
assert parsed.acceptance_criteria == [
"Picker shows AI Gateway prominently.",
"Headers appear on OpenRouter calls.",
]
def test_parse_child_issue_handles_singular_commit_heading_and_files_touched():
issue = {
"number": 961,
"title": "[QA] Verify web dashboard update/restart action buttons",
"state": "closed",
"html_url": "https://forge.example/961",
"comments": 16,
"body": CHILD_BODY_SINGULAR,
}
parsed = mrp.parse_child_issue(issue)
assert [c.sha for c in parsed.commits] == ["fc21c1420"]
assert parsed.files_touched == [
"web/src/pages/StatusPage.tsx",
"web/src/lib/api.ts",
"web/src/i18n/en.ts",
]
assert parsed.tasks == [
"Open the Web UI status page and verify both buttons are present.",
"Click Restart Gateway in a safe environment.",
]
def test_build_packet_markdown_renders_summary_and_details():
epic_issue = {
"number": 949,
"title": "EPIC: Morning review packet — Hermes harness features landed 2026-04-21",
"state": "open",
"html_url": "https://forge.example/949",
"body": EPIC_BODY,
}
child_a = mrp.parse_child_issue({
"number": 950,
"title": "[QA] Verify AI Gateway provider UX + attribution headers",
"state": "open",
"html_url": "https://forge.example/950",
"comments": 0,
"body": CHILD_BODY_PLURAL,
})
child_b = mrp.parse_child_issue({
"number": 961,
"title": "[QA] Verify web dashboard update/restart action buttons",
"state": "closed",
"html_url": "https://forge.example/961",
"comments": 16,
"body": CHILD_BODY_SINGULAR,
})
markdown = mrp.build_packet_markdown(epic_issue, [child_a, child_b])
assert "# Morning Review Packet" in markdown
assert "EPIC: Morning review packet — Hermes harness features landed 2026-04-21" in markdown
assert "| #950 | open | 2 | 2 |" in markdown
assert "| #961 | closed | 1 | 0 |" in markdown
assert "## #950 — [QA] Verify AI Gateway provider UX + attribution headers" in markdown
assert "## #961 — [QA] Verify web dashboard update/restart action buttons" in markdown
assert "`b11753879` — attribution default_headers for ai-gateway provider" in markdown
assert "`web/src/pages/StatusPage.tsx`" in markdown

View File

@@ -11,12 +11,14 @@ import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
from vision_benchmark import (
analyze_with_model,
compute_ocr_accuracy,
compute_description_completeness,
compute_structural_accuracy,
aggregate_results,
to_markdown,
generate_sample_dataset,
load_dataset,
MODELS,
EVAL_PROMPTS,
)
@@ -197,6 +199,71 @@ class TestMarkdown:
class TestDataset:
def test_repo_dataset_uses_local_image_paths(self):
dataset_path = Path(__file__).parent.parent / "benchmarks" / "test_images.json"
dataset = json.loads(dataset_path.read_text())
assert dataset, "benchmark dataset should not be empty"
assert all(not entry["url"].startswith(("http://", "https://")) for entry in dataset)
def test_load_dataset_resolves_relative_local_paths(self, tmp_path):
images_dir = tmp_path / "images"
images_dir.mkdir()
image_path = images_dir / "sample.png"
image_path.write_bytes(b"png-bytes")
dataset_path = tmp_path / "dataset.json"
dataset_path.write_text(json.dumps([
{
"id": "sample",
"url": "images/sample.png",
"category": "photo",
"expected_keywords": [],
"expected_structure": {"min_length": 30, "min_sentences": 1},
}
]))
loaded = load_dataset(str(dataset_path))
assert loaded[0]["url"] == str(image_path.resolve())
@pytest.mark.asyncio
async def test_analyze_with_model_encodes_local_file_as_data_url(self, tmp_path, monkeypatch):
image_path = tmp_path / "tiny.png"
image_path.write_bytes(
bytes.fromhex(
"89504E470D0A1A0A"
"0000000D49484452000000010000000108060000001F15C489"
"0000000D49444154789C6360000002000154A24F5D00000000"
"49454E44AE426082"
)
)
fake_response = MagicMock()
fake_response.raise_for_status.return_value = None
fake_response.json.return_value = {
"choices": [{"message": {"content": "Looks like a tiny image."}}],
"usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3},
}
fake_client = MagicMock()
fake_client.post = AsyncMock(return_value=fake_response)
fake_ctx = MagicMock()
fake_ctx.__aenter__ = AsyncMock(return_value=fake_client)
fake_ctx.__aexit__ = AsyncMock(return_value=None)
monkeypatch.setenv("OPENROUTER_API_KEY", "test-key")
with patch("httpx.AsyncClient", return_value=fake_ctx):
result = await analyze_with_model(
str(image_path),
"Describe this image",
{"provider": "openrouter", "model_id": "fake/model"},
)
assert result["success"] is True
sent_url = fake_client.post.await_args.kwargs["json"]["messages"][0]["content"][1]["image_url"]["url"]
assert sent_url.startswith("data:image/png;base64,")
def test_sample_dataset_has_entries(self):
dataset = generate_sample_dataset()
assert len(dataset) >= 4