fix: vendor vision benchmark fixtures (#868)

2026-04-22 11:37:04 -04:00
33 changed files with 353 additions and 596 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -123,16 +123,6 @@ _OR_HEADERS = {
    "X-OpenRouter-Categories": "productivity,cli-agent",
 }

-# Vercel AI Gateway app attribution headers. HTTP-Referer maps to
-# referrerUrl and X-Title maps to appName in the gateway analytics.
-from hermes_cli import __version__ as _HERMES_VERSION
-
-_AI_GATEWAY_HEADERS = {
-    "HTTP-Referer": "https://hermes-agent.nousresearch.com",
-    "X-Title": "Hermes Agent",
-    "User-Agent": f"HermesAgent/{_HERMES_VERSION}",
-}
-
 # Nous Portal extra_body for product attribution.
 # Callers should pass this as extra_body in chat.completions.create()
 # when the auxiliary client is backed by Nous Portal.
--- a/benchmarks/test_images.json
+++ b/benchmarks/test_images.json
@@ -1,194 +1,354 @@
 [
  {
    "id": "screenshot_github_home",
-    "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
+    "url": "test_images/screenshot_github_home.png",
    "category": "screenshot",
-    "expected_keywords": ["github", "logo", "mark"],
+    "expected_keywords": [
+      "github",
+      "logo",
+      "mark"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "diagram_mermaid_flow",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
+    "url": "test_images/diagram_mermaid_flow.png",
    "category": "diagram",
-    "expected_keywords": ["flow", "diagram", "process"],
+    "expected_keywords": [
+      "flow",
+      "diagram",
+      "process"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_1",
-    "url": "https://picsum.photos/seed/vision1/400/300",
+    "url": "test_images/photo_random_1.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_2",
-    "url": "https://picsum.photos/seed/vision2/400/300",
+    "url": "test_images/photo_random_2.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_simple_bar",
-    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
+    "url": "test_images/chart_simple_bar.png",
    "category": "chart",
-    "expected_keywords": ["bar", "chart", "revenue"],
+    "expected_keywords": [
+      "bar",
+      "chart",
+      "revenue"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "chart_pie",
-    "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
+    "url": "test_images/chart_pie.png",
    "category": "chart",
-    "expected_keywords": ["pie", "chart", "percentage"],
+    "expected_keywords": [
+      "pie",
+      "chart",
+      "percentage"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "diagram_org_chart",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "test_images/diagram_org_chart.png",
    "category": "diagram",
-    "expected_keywords": ["organization", "hierarchy", "chart"],
+    "expected_keywords": [
+      "organization",
+      "hierarchy",
+      "chart"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "screenshot_terminal",
-    "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
+    "url": "test_images/screenshot_terminal.png",
    "category": "screenshot",
-    "expected_keywords": ["terminal", "command", "output"],
+    "expected_keywords": [
+      "terminal",
+      "command",
+      "output"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_3",
-    "url": "https://picsum.photos/seed/vision3/400/300",
+    "url": "test_images/photo_random_3.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_line",
-    "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
+    "url": "test_images/chart_line.png",
    "category": "chart",
-    "expected_keywords": ["line", "chart", "temperature"],
+    "expected_keywords": [
+      "line",
+      "chart",
+      "temperature"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "diagram_sequence",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "test_images/diagram_sequence.png",
    "category": "diagram",
-    "expected_keywords": ["sequence", "interaction", "message"],
+    "expected_keywords": [
+      "sequence",
+      "interaction",
+      "message"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_4",
-    "url": "https://picsum.photos/seed/vision4/400/300",
+    "url": "test_images/photo_random_4.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "screenshot_webpage",
-    "url": "https://github.githubassets.com/images/modules/site/social-cards.png",
+    "url": "test_images/screenshot_webpage.png",
    "category": "screenshot",
-    "expected_keywords": ["github", "page", "web"],
+    "expected_keywords": [
+      "github",
+      "page",
+      "web"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_radar",
-    "url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
+    "url": "test_images/chart_radar.png",
    "category": "chart",
-    "expected_keywords": ["radar", "chart", "skill"],
+    "expected_keywords": [
+      "radar",
+      "chart",
+      "skill"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "photo_random_5",
-    "url": "https://picsum.photos/seed/vision5/400/300",
+    "url": "test_images/photo_random_5.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "diagram_class",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "test_images/diagram_class.png",
    "category": "diagram",
-    "expected_keywords": ["class", "object", "attribute"],
+    "expected_keywords": [
+      "class",
+      "object",
+      "attribute"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_doughnut",
-    "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
+    "url": "test_images/chart_doughnut.png",
    "category": "chart",
-    "expected_keywords": ["doughnut", "chart", "device"],
+    "expected_keywords": [
+      "doughnut",
+      "chart",
+      "device"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "photo_random_6",
-    "url": "https://picsum.photos/seed/vision6/400/300",
+    "url": "test_images/photo_random_6.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "screenshot_error",
-    "url": "https://http.cat/404.jpg",
+    "url": "test_images/screenshot_error.png",
    "category": "screenshot",
-    "expected_keywords": ["404", "error", "cat"],
+    "expected_keywords": [
+      "404",
+      "error",
+      "cat"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": true
+    }
  },
  {
    "id": "diagram_network",
-    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "url": "test_images/diagram_network.png",
    "category": "diagram",
-    "expected_keywords": ["network", "node", "connection"],
+    "expected_keywords": [
+      "network",
+      "node",
+      "connection"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_7",
-    "url": "https://picsum.photos/seed/vision7/400/300",
+    "url": "test_images/photo_random_7.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "chart_stacked_bar",
-    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
+    "url": "test_images/chart_stacked_bar.png",
    "category": "chart",
-    "expected_keywords": ["stacked", "bar", "chart"],
+    "expected_keywords": [
+      "stacked",
+      "bar",
+      "chart"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+    "expected_structure": {
+      "min_length": 50,
+      "min_sentences": 2,
+      "has_numbers": true
+    }
  },
  {
    "id": "screenshot_dashboard",
-    "url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
+    "url": "test_images/screenshot_dashboard.png",
    "category": "screenshot",
-    "expected_keywords": ["search", "code", "feature"],
+    "expected_keywords": [
+      "search",
+      "code",
+      "feature"
+    ],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  },
  {
    "id": "photo_random_8",
-    "url": "https://picsum.photos/seed/vision8/400/300",
+    "url": "test_images/photo_random_8.png",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
-    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+    "expected_structure": {
+      "min_length": 30,
+      "min_sentences": 1,
+      "has_numbers": false
+    }
  }
 ]
--- a/benchmarks/test_images/chart_doughnut.png
+++ b/benchmarks/test_images/chart_doughnut.png
--- a/benchmarks/test_images/chart_line.png
+++ b/benchmarks/test_images/chart_line.png
--- a/benchmarks/test_images/chart_pie.png
+++ b/benchmarks/test_images/chart_pie.png
--- a/benchmarks/test_images/chart_radar.png
+++ b/benchmarks/test_images/chart_radar.png
--- a/benchmarks/test_images/chart_simple_bar.png
+++ b/benchmarks/test_images/chart_simple_bar.png
--- a/benchmarks/test_images/chart_stacked_bar.png
+++ b/benchmarks/test_images/chart_stacked_bar.png
--- a/benchmarks/test_images/diagram_class.png
+++ b/benchmarks/test_images/diagram_class.png
--- a/benchmarks/test_images/diagram_mermaid_flow.png
+++ b/benchmarks/test_images/diagram_mermaid_flow.png
--- a/benchmarks/test_images/diagram_network.png
+++ b/benchmarks/test_images/diagram_network.png
--- a/benchmarks/test_images/diagram_org_chart.png
+++ b/benchmarks/test_images/diagram_org_chart.png
--- a/benchmarks/test_images/diagram_sequence.png
+++ b/benchmarks/test_images/diagram_sequence.png
--- a/benchmarks/test_images/photo_random_1.png
+++ b/benchmarks/test_images/photo_random_1.png
--- a/benchmarks/test_images/photo_random_2.png
+++ b/benchmarks/test_images/photo_random_2.png
--- a/benchmarks/test_images/photo_random_3.png
+++ b/benchmarks/test_images/photo_random_3.png
--- a/benchmarks/test_images/photo_random_4.png
+++ b/benchmarks/test_images/photo_random_4.png
--- a/benchmarks/test_images/photo_random_5.png
+++ b/benchmarks/test_images/photo_random_5.png
--- a/benchmarks/test_images/photo_random_6.png
+++ b/benchmarks/test_images/photo_random_6.png
--- a/benchmarks/test_images/photo_random_7.png
+++ b/benchmarks/test_images/photo_random_7.png
--- a/benchmarks/test_images/photo_random_8.png
+++ b/benchmarks/test_images/photo_random_8.png
--- a/benchmarks/test_images/screenshot_dashboard.png
+++ b/benchmarks/test_images/screenshot_dashboard.png
--- a/benchmarks/test_images/screenshot_error.png
+++ b/benchmarks/test_images/screenshot_error.png
--- a/benchmarks/test_images/screenshot_github_home.png
+++ b/benchmarks/test_images/screenshot_github_home.png
--- a/benchmarks/test_images/screenshot_terminal.png
+++ b/benchmarks/test_images/screenshot_terminal.png
--- a/benchmarks/test_images/screenshot_webpage.png
+++ b/benchmarks/test_images/screenshot_webpage.png
--- a/benchmarks/vision_benchmark.py
+++ b/benchmarks/vision_benchmark.py
@@ -11,17 +11,19 @@ Usage:

    # Single image test
    python benchmarks/vision_benchmark.py --url https://example.com/image.png
+    python benchmarks/vision_benchmark.py --url benchmarks/test_images/photo_random_1.png

    # Generate test report
    python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json

-Test image dataset: benchmarks/test_images.json (50-100 diverse images)
+Test image dataset: benchmarks/test_images.json (committed local fixtures under benchmarks/test_images/)
 """

 import argparse
 import asyncio
 import base64
 import json
+import mimetypes
 import os
 import statistics
 import sys
@@ -67,6 +69,62 @@ EVAL_PROMPTS = {
 # ---------------------------------------------------------------------------


+def _is_remote_image_source(image_source: str) -> bool:
+    """Return True when *image_source* starts with an explicit scheme.
+
+    Despite the name this also matches ``data:`` and ``file://`` sources, not
+    only http(s) URLs.
+    """
+    return image_source.startswith(("http://", "https://", "data:", "file://"))
+
+
+def _image_source_to_payload_url(image_source: str) -> str:
+    """Convert local image paths into data URLs; keep remote URLs unchanged.
+
+    ``http(s)://`` and ``data:`` sources are returned as-is.  Anything else is
+    treated as a filesystem path (``~`` is expanded; a ``file://`` prefix is
+    accepted) and, when it names an existing file, is inlined as a base64
+    ``data:`` URL whose MIME type is guessed from the file name.  If no such
+    file exists the input is returned unchanged.
+    """
+    if image_source.startswith(("http://", "https://", "data:")):
+        return image_source
+
+    candidates = [image_source]
+    if image_source.startswith("file://"):
+        # Function-scope imports keep this helper self-contained.
+        from urllib.parse import urlparse
+        from urllib.request import url2pathname
+
+        # First choice: the literal text after the prefix (historical behaviour).
+        candidates = [image_source[len("file://"):]]
+        parsed = urlparse(image_source)
+        if parsed.netloc in ("", "localhost"):
+            # Well-formed file URI: percent-decode the path so names such as
+            # "my%20image.png" or "file://localhost/..." resolve as well.
+            try:
+                candidates.append(url2pathname(parsed.path))
+            except OSError:
+                pass  # malformed drive/path component; rely on the literal form
+
+    local_path = None
+    for candidate in candidates:
+        probe = Path(os.path.expanduser(candidate)).resolve()
+        if probe.is_file():
+            local_path = probe
+            break
+    if local_path is None:
+        # Not a readable local file: pass the source through untouched.
+        return image_source
+
+    mime_type, _ = mimetypes.guess_type(str(local_path))
+    if not mime_type:
+        mime_type = "application/octet-stream"
+
+    encoded = base64.b64encode(local_path.read_bytes()).decode("ascii")
+    return f"data:{mime_type};base64,{encoded}"
+
+
 async def analyze_with_model(
    image_url: str,
    prompt: str,
@@ -84,6 +108,8 @@ async def analyze_with_model(
    """
    import httpx

+    image_payload_url = _image_source_to_payload_url(image_url)
+
    provider = model_config["provider"]
    model_id = model_config["model_id"]

@@ -93,7 +119,7 @@ async def analyze_with_model(
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "image_url", "image_url": {"url": image_payload_url}},
            ],
        }
    ]
@@ -570,8 +596,28 @@ def generate_sample_dataset() -> List[dict]:

 def load_dataset(path: str) -> List[dict]:
-    """Load test dataset from JSON file."""
-    with open(path) as f:
-        return json.load(f)
+    """Load the test dataset from a JSON file.
+
+    Entries whose ``url`` is a plain filesystem path are rewritten in place to
+    an absolute path resolved against the dataset file's own directory, so the
+    dataset works from any working directory.  Sources that already carry a
+    scheme (``http(s)://``, ``data:``, ``file://``) are left untouched.
+    """
+    dataset_path = Path(path).resolve()
+    # JSON text is UTF-8; do not depend on the platform's locale encoding.
+    with open(dataset_path, encoding="utf-8") as f:
+        dataset = json.load(f)
+
+    base_dir = dataset_path.parent
+    for image in dataset:
+        image_url = image.get("url")
+        # Missing, empty or non-string values cannot be resolved as paths.
+        if not isinstance(image_url, str) or not image_url:
+            continue
+        if _is_remote_image_source(image_url):
+            continue
+        image["url"] = str((base_dir / image_url).resolve())
+
+    return dataset


 # ---------------------------------------------------------------------------
@@ -582,7 +618,7 @@ def load_dataset(path: str) -> List[dict]:
 async def main():
    parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
    parser.add_argument("--images", help="Path to test images JSON file")
-    parser.add_argument("--url", help="Single image URL to test")
+    parser.add_argument("--url", help="Single image URL or local file path to test")
    parser.add_argument("--category", default="photo", help="Category for single URL")
    parser.add_argument("--output", default=None, help="Output JSON file")
    parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -168,7 +168,7 @@ import time as _time
 from datetime import datetime

 from hermes_cli import __version__, __release_date__
-from hermes_constants import AI_GATEWAY_BASE_URL, OPENROUTER_BASE_URL
+from hermes_constants import OPENROUTER_BASE_URL

 logger = logging.getLogger(__name__)

@@ -1112,8 +1112,6 @@ def select_provider_and_model(args=None):
    # Step 2: Provider-specific setup + model selection
    if selected_provider == "openrouter":
        _model_flow_openrouter(config, current_model)
-    elif selected_provider == "ai-gateway":
-        _model_flow_ai_gateway(config, current_model)
    elif selected_provider == "nous":
        _model_flow_nous(config, current_model, args=args)
    elif selected_provider == "openai-codex":
@@ -1269,55 +1267,6 @@ def _model_flow_openrouter(config, current_model=""):
        print("No change.")


-def _model_flow_ai_gateway(config, current_model=""):
-    """Vercel AI Gateway provider: ensure API key, then pick model with pricing."""
-    from hermes_cli.auth import _prompt_model_selection, _save_model_choice, deactivate_provider
-    from hermes_cli.config import get_env_value, save_env_value
-    from hermes_cli.models import ai_gateway_model_ids, get_pricing_for_provider
-
-    api_key = get_env_value("AI_GATEWAY_API_KEY")
-    if not api_key:
-        print("No Vercel AI Gateway API key configured.")
-        print("Create API key here: https://vercel.com/d?to=%2F%5Bteam%5D%2F%7E%2Fai-gateway&title=AI+Gateway")
-        print("Add a payment method to get $5 in free credits.")
-        print()
-        try:
-            import getpass
-            key = getpass.getpass("AI Gateway API key (or Enter to cancel): ").strip()
-        except (KeyboardInterrupt, EOFError):
-            print()
-            return
-        if not key:
-            print("Cancelled.")
-            return
-        save_env_value("AI_GATEWAY_API_KEY", key)
-        print("API key saved.")
-        print()
-
-    models_list = ai_gateway_model_ids(force_refresh=True)
-    pricing = get_pricing_for_provider("ai-gateway", force_refresh=True)
-
-    selected = _prompt_model_selection(models_list, current_model=current_model, pricing=pricing)
-    if selected:
-        _save_model_choice(selected)
-
-        from hermes_cli.config import load_config, save_config
-
-        cfg = load_config()
-        model = cfg.get("model")
-        if not isinstance(model, dict):
-            model = {"default": model} if model else {}
-            cfg["model"] = model
-        model["provider"] = "ai-gateway"
-        model["base_url"] = AI_GATEWAY_BASE_URL
-        model["api_mode"] = "chat_completions"
-        save_config(cfg)
-        deactivate_provider()
-        print(f"Default model set to: {selected} (via Vercel AI Gateway)")
-    else:
-        print("No change.")
-
-
 def _model_flow_nous(config, current_model="", args=None):
    """Nous Portal provider: ensure logged in, then pick model."""
    from hermes_cli.auth import (
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@@ -58,28 +58,6 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [

 _openrouter_catalog_cache: list[tuple[str, str]] | None = None

-# Fallback Vercel AI Gateway snapshot used when the live catalog is unavailable.
-# OSS / open-weight models prioritized first, then closed-source by family.
-VERCEL_AI_GATEWAY_MODELS: list[tuple[str, str]] = [
-    ("moonshotai/kimi-k2.6", "recommended"),
-    ("alibaba/qwen3.6-plus", ""),
-    ("zai/glm-5.1", ""),
-    ("minimax/minimax-m2.7", ""),
-    ("anthropic/claude-sonnet-4.6", ""),
-    ("anthropic/claude-opus-4.7", ""),
-    ("anthropic/claude-opus-4.6", ""),
-    ("anthropic/claude-haiku-4.5", ""),
-    ("openai/gpt-5.4", ""),
-    ("openai/gpt-5.4-mini", ""),
-    ("openai/gpt-5.3-codex", ""),
-    ("google/gemini-3.1-pro-preview", ""),
-    ("google/gemini-3-flash", ""),
-    ("google/gemini-3.1-flash-lite-preview", ""),
-    ("xai/grok-4.20-reasoning", ""),
-]
-
-_ai_gateway_catalog_cache: list[tuple[str, str]] | None = None
-

 def _codex_curated_models() -> list[str]:
    """Derive the openai-codex curated list from codex_models.py.
@@ -280,21 +258,18 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "minimax-m2.5",
    ],
    "ai-gateway": [
-        "moonshotai/kimi-k2.6",
-        "alibaba/qwen3.6-plus",
-        "zai/glm-5.1",
-        "minimax/minimax-m2.7",
-        "anthropic/claude-sonnet-4.6",
-        "anthropic/claude-opus-4.7",
        "anthropic/claude-opus-4.6",
+        "anthropic/claude-sonnet-4.6",
+        "anthropic/claude-sonnet-4.5",
        "anthropic/claude-haiku-4.5",
-        "openai/gpt-5.4",
-        "openai/gpt-5.4-mini",
-        "openai/gpt-5.3-codex",
-        "google/gemini-3.1-pro-preview",
+        "openai/gpt-5",
+        "openai/gpt-4.1",
+        "openai/gpt-4.1-mini",
+        "google/gemini-3-pro-preview",
        "google/gemini-3-flash",
-        "google/gemini-3.1-flash-lite-preview",
-        "xai/grok-4.20-reasoning",
+        "google/gemini-2.5-pro",
+        "google/gemini-2.5-flash",
+        "deepseek/deepseek-v3.2",
    ],
    "kilocode": [
        "anthropic/claude-opus-4.6",
@@ -541,7 +516,6 @@ class ProviderEntry(NamedTuple):
 CANONICAL_PROVIDERS: list[ProviderEntry] = [
    ProviderEntry("nous",           "Nous Portal",              "Nous Portal (Nous Research subscription)"),
    ProviderEntry("openrouter",     "OpenRouter",               "OpenRouter (100+ models, pay-per-use)"),
-    ProviderEntry("ai-gateway",     "Vercel AI Gateway",        "Vercel AI Gateway (200+ models, $5 free credit, no markup)"),
    ProviderEntry("anthropic",      "Anthropic",                "Anthropic (Claude models — API key or Claude Code)"),
    ProviderEntry("openai-codex",   "OpenAI Codex",             "OpenAI Codex"),
    ProviderEntry("xiaomi",         "Xiaomi MiMo",              "Xiaomi MiMo (MiMo-V2 models — pro, omni, flash)"),
@@ -562,6 +536,7 @@ CANONICAL_PROVIDERS: list[ProviderEntry] = [
    ProviderEntry("kilocode",       "Kilo Code",                "Kilo Code (Kilo Gateway API)"),
    ProviderEntry("opencode-zen",   "OpenCode Zen",             "OpenCode Zen (35+ curated models, pay-as-you-go)"),
    ProviderEntry("opencode-go",    "OpenCode Go",              "OpenCode Go (open models, $10/month subscription)"),
+    ProviderEntry("ai-gateway",     "Vercel AI Gateway",        "Vercel AI Gateway (200+ models, pay-per-use)"),
 ]

 # Derived dicts — used throughout the codebase
@@ -704,90 +679,6 @@ def model_ids(*, force_refresh: bool = False) -> list[str]:



-def _ai_gateway_model_is_free(pricing: Any) -> bool:
-    """Return True if an AI Gateway model has $0 input AND output pricing."""
-    if not isinstance(pricing, dict):
-        return False
-    try:
-        return float(pricing.get("input", "0")) == 0 and float(pricing.get("output", "0")) == 0
-    except (TypeError, ValueError):
-        return False
-
-
-def fetch_ai_gateway_models(
-    timeout: float = 8.0,
-    *,
-    force_refresh: bool = False,
-) -> list[tuple[str, str]]:
-    """Return the curated AI Gateway picker list, refreshed from the live catalog when possible."""
-    global _ai_gateway_catalog_cache
-
-    if _ai_gateway_catalog_cache is not None and not force_refresh:
-        return list(_ai_gateway_catalog_cache)
-
-    from hermes_constants import AI_GATEWAY_BASE_URL
-
-    fallback = list(VERCEL_AI_GATEWAY_MODELS)
-    preferred_ids = [mid for mid, _ in fallback]
-
-    try:
-        req = urllib.request.Request(
-            f"{AI_GATEWAY_BASE_URL.rstrip('/')}/models",
-            headers={"Accept": "application/json"},
-        )
-        with urllib.request.urlopen(req, timeout=timeout) as resp:
-            payload = json.loads(resp.read().decode())
-    except Exception:
-        return list(_ai_gateway_catalog_cache or fallback)
-
-    live_items = payload.get("data", [])
-    if not isinstance(live_items, list):
-        return list(_ai_gateway_catalog_cache or fallback)
-
-    live_by_id: dict[str, dict[str, Any]] = {}
-    for item in live_items:
-        if not isinstance(item, dict):
-            continue
-        mid = str(item.get("id") or "").strip()
-        if not mid:
-            continue
-        live_by_id[mid] = item
-
-    curated: list[tuple[str, str]] = []
-    for preferred_id in preferred_ids:
-        live_item = live_by_id.get(preferred_id)
-        if live_item is None:
-            continue
-        desc = "free" if _ai_gateway_model_is_free(live_item.get("pricing")) else ""
-        curated.append((preferred_id, desc))
-
-    if not curated:
-        return list(_ai_gateway_catalog_cache or fallback)
-
-    free_moonshot = next(
-        (
-            mid
-            for mid, item in live_by_id.items()
-            if mid.startswith("moonshotai/") and _ai_gateway_model_is_free(item.get("pricing"))
-        ),
-        None,
-    )
-    if free_moonshot:
-        curated = [(mid, desc) for mid, desc in curated if mid != free_moonshot]
-        curated.insert(0, (free_moonshot, "recommended"))
-    else:
-        first_id, _ = curated[0]
-        curated[0] = (first_id, "recommended")
-
-    _ai_gateway_catalog_cache = curated
-    return list(curated)
-
-
-def ai_gateway_model_ids(*, force_refresh: bool = False) -> list[str]:
-    """Return just the AI Gateway model-id strings."""
-    return [mid for mid, _ in fetch_ai_gateway_models(force_refresh=force_refresh)]
-
-
 # ---------------------------------------------------------------------------
 # Pricing helpers — fetch live pricing from OpenRouter-compatible /v1/models
 # ---------------------------------------------------------------------------
@@ -930,51 +821,6 @@ def fetch_models_with_pricing(
    return result


-def fetch_ai_gateway_pricing(
-    timeout: float = 8.0,
-    *,
-    force_refresh: bool = False,
-) -> dict[str, dict[str, str]]:
-    """Fetch Vercel AI Gateway /v1/models and return Hermes-shaped pricing."""
-    from hermes_constants import AI_GATEWAY_BASE_URL
-
-    cache_key = AI_GATEWAY_BASE_URL.rstrip("/")
-    if not force_refresh and cache_key in _pricing_cache:
-        return _pricing_cache[cache_key]
-
-    try:
-        req = urllib.request.Request(
-            f"{cache_key}/models",
-            headers={"Accept": "application/json"},
-        )
-        with urllib.request.urlopen(req, timeout=timeout) as resp:
-            payload = json.loads(resp.read().decode())
-    except Exception:
-        _pricing_cache[cache_key] = {}
-        return {}
-
-    result: dict[str, dict[str, str]] = {}
-    for item in payload.get("data", []):
-        if not isinstance(item, dict):
-            continue
-        mid = item.get("id")
-        pricing = item.get("pricing")
-        if not (mid and isinstance(pricing, dict)):
-            continue
-        entry: dict[str, str] = {
-            "prompt": str(pricing.get("input", "")),
-            "completion": str(pricing.get("output", "")),
-        }
-        if pricing.get("input_cache_read"):
-            entry["input_cache_read"] = str(pricing["input_cache_read"])
-        if pricing.get("input_cache_write"):
-            entry["input_cache_write"] = str(pricing["input_cache_write"])
-        result[mid] = entry
-
-    _pricing_cache[cache_key] = result
-    return result
-
-
 def _resolve_openrouter_api_key() -> str:
    """Best-effort OpenRouter API key for pricing fetch."""
    return os.getenv("OPENROUTER_API_KEY", "").strip()
@@ -993,7 +839,7 @@ def _resolve_nous_pricing_credentials() -> tuple[str, str]:


 def get_pricing_for_provider(provider: str, *, force_refresh: bool = False) -> dict[str, dict[str, str]]:
-    """Return live pricing for providers that support it (openrouter, ai-gateway, nous)."""
+    """Return live pricing for providers that support it (openrouter, nous)."""
    normalized = normalize_provider(provider)
    if normalized == "openrouter":
        return fetch_models_with_pricing(
@@ -1001,11 +847,11 @@ def get_pricing_for_provider(provider: str, *, force_refresh: bool = False) -> d
            base_url="https://openrouter.ai/api",
            force_refresh=force_refresh,
        )
-    if normalized == "ai-gateway":
-        return fetch_ai_gateway_pricing(force_refresh=force_refresh)
    if normalized == "nous":
        api_key, base_url = _resolve_nous_pricing_credentials()
        if base_url:
+            # Nous base_url typically looks like https://inference-api.nousresearch.com/v1
+            # We need the part before /v1 for our fetch function
            stripped = base_url.rstrip("/")
            if stripped.endswith("/v1"):
                stripped = stripped[:-3]
@@ -1407,7 +1253,9 @@ def provider_model_ids(provider: Optional[str], *, force_refresh: bool = False)
        if live:
            return live
    if normalized == "ai-gateway":
-        return ai_gateway_model_ids()
+        live = _fetch_ai_gateway_models()
+        if live:
+            return live
    if normalized == "custom":
        base_url = _get_custom_base_url()
        if base_url:
--- a/run_agent.py
+++ b/run_agent.py
@@ -908,10 +908,6 @@ class AIAgent:
                        "X-OpenRouter-Title": "Hermes Agent",
                        "X-OpenRouter-Categories": "productivity,cli-agent",
                    }
-                elif "ai-gateway.vercel.sh" in effective_base.lower():
-                    from agent.auxiliary_client import _AI_GATEWAY_HEADERS
-
-                    client_kwargs["default_headers"] = dict(_AI_GATEWAY_HEADERS)
                elif "api.githubcopilot.com" in effective_base.lower():
                    from hermes_cli.models import copilot_default_headers

@@ -4671,13 +4667,11 @@ class AIAgent:
        return True

    def _apply_client_headers_for_base_url(self, base_url: str) -> None:
-        from agent.auxiliary_client import _AI_GATEWAY_HEADERS, _OR_HEADERS
+        from agent.auxiliary_client import _OR_HEADERS

        normalized = (base_url or "").lower()
        if "openrouter" in normalized:
            self._client_kwargs["default_headers"] = dict(_OR_HEADERS)
-        elif "ai-gateway.vercel.sh" in normalized:
-            self._client_kwargs["default_headers"] = dict(_AI_GATEWAY_HEADERS)
        elif "api.githubcopilot.com" in normalized:
            from hermes_cli.models import copilot_default_headers

--- a/tests/hermes_cli/test_ai_gateway_models.py
+++ b/tests/hermes_cli/test_ai_gateway_models.py
@@ -1,222 +0,0 @@
-"""AI Gateway provider UX, live pricing, and model promotion tests."""
-
-from __future__ import annotations
-
-import json
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from hermes_cli import models as models_module
-from hermes_cli.models import (
-    CANONICAL_PROVIDERS,
-    VERCEL_AI_GATEWAY_MODELS,
-    _ai_gateway_model_is_free,
-    ai_gateway_model_ids,
-    fetch_ai_gateway_models,
-    fetch_ai_gateway_pricing,
-    get_pricing_for_provider,
-)
-
-
-def _mock_urlopen(payload):
-    resp = MagicMock()
-    resp.read.return_value = json.dumps(payload).encode()
-    ctx = MagicMock()
-    ctx.__enter__.return_value = resp
-    ctx.__exit__.return_value = False
-    return ctx
-
-
-def _reset_caches():
-    models_module._ai_gateway_catalog_cache = None
-    models_module._pricing_cache.clear()
-
-
-@pytest.fixture
-def config_home(tmp_path, monkeypatch):
-    home = tmp_path / "hermes"
-    home.mkdir()
-    (home / "config.yaml").write_text("model: some-old-model\n")
-    (home / ".env").write_text("")
-    monkeypatch.setenv("HERMES_HOME", str(home))
-    monkeypatch.delenv("AI_GATEWAY_API_KEY", raising=False)
-    monkeypatch.delenv("AI_GATEWAY_BASE_URL", raising=False)
-    return home
-
-
-def test_ai_gateway_provider_is_promoted_near_top_of_picker():
-    slugs = [entry.slug for entry in CANONICAL_PROVIDERS]
-    assert "ai-gateway" in slugs[:3]
-
-
-def test_ai_gateway_pricing_translates_input_output_to_prompt_completion():
-    _reset_caches()
-    payload = {
-        "data": [
-            {
-                "id": "moonshotai/kimi-k2.5",
-                "type": "language",
-                "pricing": {
-                    "input": "0.0000006",
-                    "output": "0.0000025",
-                    "input_cache_read": "0.00000015",
-                    "input_cache_write": "0.0000006",
-                },
-            }
-        ]
-    }
-    with patch("urllib.request.urlopen", return_value=_mock_urlopen(payload)):
-        result = fetch_ai_gateway_pricing(force_refresh=True)
-
-    entry = result["moonshotai/kimi-k2.5"]
-    assert entry["prompt"] == "0.0000006"
-    assert entry["completion"] == "0.0000025"
-    assert entry["input_cache_read"] == "0.00000015"
-    assert entry["input_cache_write"] == "0.0000006"
-
-
-def test_get_pricing_for_provider_supports_ai_gateway():
-    _reset_caches()
-    payload = {
-        "data": [
-            {
-                "id": "moonshotai/kimi-k2.5",
-                "type": "language",
-                "pricing": {"input": "0.0001", "output": "0.0002"},
-            }
-        ]
-    }
-    with patch("urllib.request.urlopen", return_value=_mock_urlopen(payload)):
-        result = get_pricing_for_provider("ai-gateway", force_refresh=True)
-    assert result["moonshotai/kimi-k2.5"] == {"prompt": "0.0001", "completion": "0.0002"}
-
-
-def test_ai_gateway_pricing_returns_empty_on_fetch_failure():
-    _reset_caches()
-    with patch("urllib.request.urlopen", side_effect=OSError("network down")):
-        result = fetch_ai_gateway_pricing(force_refresh=True)
-    assert result == {}
-
-
-def test_ai_gateway_pricing_skips_entries_without_pricing_dict():
-    _reset_caches()
-    payload = {
-        "data": [
-            {"id": "x/y", "pricing": None},
-            {"id": "a/b", "pricing": {"input": "0", "output": "0"}},
-        ]
-    }
-    with patch("urllib.request.urlopen", return_value=_mock_urlopen(payload)):
-        result = fetch_ai_gateway_pricing(force_refresh=True)
-    assert "x/y" not in result
-    assert result["a/b"] == {"prompt": "0", "completion": "0"}
-
-
-def test_ai_gateway_free_detector():
-    assert _ai_gateway_model_is_free({"input": "0", "output": "0"}) is True
-    assert _ai_gateway_model_is_free({"input": "0", "output": "0.01"}) is False
-    assert _ai_gateway_model_is_free({"input": "0.01", "output": "0"}) is False
-    assert _ai_gateway_model_is_free(None) is False
-    assert _ai_gateway_model_is_free({"input": "not a number"}) is False
-
-
-def test_fetch_ai_gateway_models_filters_against_live_catalog():
-    _reset_caches()
-    preferred = [mid for mid, _ in VERCEL_AI_GATEWAY_MODELS]
-    live_ids = preferred[:3]
-    payload = {
-        "data": [
-            {"id": mid, "pricing": {"input": "0.001", "output": "0.002"}}
-            for mid in live_ids
-        ]
-    }
-    with patch("urllib.request.urlopen", return_value=_mock_urlopen(payload)):
-        result = fetch_ai_gateway_models(force_refresh=True)
-
-    assert [mid for mid, _ in result] == live_ids
-    assert result[0][1] == "recommended"
-    assert ai_gateway_model_ids(force_refresh=False) == live_ids
-
-
-def test_fetch_ai_gateway_models_tags_free_models():
-    _reset_caches()
-    first_id = VERCEL_AI_GATEWAY_MODELS[0][0]
-    second_id = VERCEL_AI_GATEWAY_MODELS[1][0]
-    payload = {
-        "data": [
-            {"id": first_id, "pricing": {"input": "0.001", "output": "0.002"}},
-            {"id": second_id, "pricing": {"input": "0", "output": "0"}},
-        ]
-    }
-    with patch("urllib.request.urlopen", return_value=_mock_urlopen(payload)):
-        result = fetch_ai_gateway_models(force_refresh=True)
-
-    by_id = dict(result)
-    assert by_id[first_id] == "recommended"
-    assert by_id[second_id] == "free"
-
-
-def test_free_moonshot_model_auto_promoted_to_top_even_if_not_curated():
-    _reset_caches()
-    first_curated = VERCEL_AI_GATEWAY_MODELS[0][0]
-    unlisted_free_moonshot = "moonshotai/kimi-coder-free-preview"
-    payload = {
-        "data": [
-            {"id": first_curated, "pricing": {"input": "0.001", "output": "0.002"}},
-            {"id": unlisted_free_moonshot, "pricing": {"input": "0", "output": "0"}},
-        ]
-    }
-    with patch("urllib.request.urlopen", return_value=_mock_urlopen(payload)):
-        result = fetch_ai_gateway_models(force_refresh=True)
-
-    assert result[0] == (unlisted_free_moonshot, "recommended")
-    assert any(mid == first_curated for mid, _ in result)
-
-
-def test_paid_moonshot_does_not_get_auto_promoted():
-    _reset_caches()
-    first_curated = VERCEL_AI_GATEWAY_MODELS[0][0]
-    payload = {
-        "data": [
-            {"id": first_curated, "pricing": {"input": "0.001", "output": "0.002"}},
-            {"id": "moonshotai/some-paid-variant", "pricing": {"input": "0.001", "output": "0.002"}},
-        ]
-    }
-    with patch("urllib.request.urlopen", return_value=_mock_urlopen(payload)):
-        result = fetch_ai_gateway_models(force_refresh=True)
-
-    assert result[0][0] == first_curated
-
-
-def test_fetch_ai_gateway_models_falls_back_on_error():
-    _reset_caches()
-    with patch("urllib.request.urlopen", side_effect=OSError("network")):
-        result = fetch_ai_gateway_models(force_refresh=True)
-    assert result == list(VERCEL_AI_GATEWAY_MODELS)
-
-
-def test_ai_gateway_setup_flow_shows_deeplink_and_passes_pricing(config_home, monkeypatch, capsys):
-    from hermes_cli.main import _model_flow_ai_gateway
-    from hermes_cli.config import load_config
-
-    pricing = {"moonshotai/kimi-k2.6": {"prompt": "0", "completion": "0"}}
-    monkeypatch.setenv("HERMES_HOME", str(config_home))
-
-    with patch("getpass.getpass", return_value="vercel-key"), \
-         patch("hermes_cli.models.ai_gateway_model_ids", return_value=["moonshotai/kimi-k2.6"]), \
-         patch("hermes_cli.models.get_pricing_for_provider", return_value=pricing), \
-         patch("hermes_cli.auth._prompt_model_selection", return_value="moonshotai/kimi-k2.6") as prompt_selection, \
-         patch("hermes_cli.auth.deactivate_provider"):
-        _model_flow_ai_gateway(load_config(), "")
-
-    out = capsys.readouterr().out
-    assert "vercel.com/d?to=%2F%5Bteam%5D%2F%7E%2Fai-gateway&title=AI+Gateway" in out
-    assert "free credits" in out.lower()
-    assert prompt_selection.call_args.kwargs["pricing"] == pricing
-
-    import yaml
-    config = yaml.safe_load((config_home / "config.yaml").read_text()) or {}
-    model = config["model"]
-    assert model["provider"] == "ai-gateway"
-    assert model["api_mode"] == "chat_completions"
--- a/tests/run_agent/test_provider_attribution_headers.py
+++ b/tests/run_agent/test_provider_attribution_headers.py
@@ -1,62 +0,0 @@
-"""Attribution default_headers applied per provider via base-URL detection."""
-
-from unittest.mock import MagicMock, patch
-
-from run_agent import AIAgent
-
-
-@patch("run_agent.OpenAI")
-def test_openrouter_base_url_applies_or_headers(mock_openai):
-    mock_openai.return_value = MagicMock()
-    agent = AIAgent(
-        api_key="test-key",
-        base_url="https://openrouter.ai/api/v1",
-        model="test/model",
-        quiet_mode=True,
-        skip_context_files=True,
-        skip_memory=True,
-    )
-
-    agent._apply_client_headers_for_base_url("https://openrouter.ai/api/v1")
-
-    headers = agent._client_kwargs["default_headers"]
-    assert headers["HTTP-Referer"] == "https://hermes-agent.nousresearch.com"
-    assert headers["X-OpenRouter-Title"] == "Hermes Agent"
-
-
-@patch("run_agent.OpenAI")
-def test_ai_gateway_base_url_applies_attribution_headers(mock_openai):
-    mock_openai.return_value = MagicMock()
-    agent = AIAgent(
-        api_key="test-key",
-        base_url="https://openrouter.ai/api/v1",
-        model="test/model",
-        quiet_mode=True,
-        skip_context_files=True,
-        skip_memory=True,
-    )
-
-    agent._apply_client_headers_for_base_url("https://ai-gateway.vercel.sh/v1")
-
-    headers = agent._client_kwargs["default_headers"]
-    assert headers["HTTP-Referer"] == "https://hermes-agent.nousresearch.com"
-    assert headers["X-Title"] == "Hermes Agent"
-    assert headers["User-Agent"].startswith("HermesAgent/")
-
-
-@patch("run_agent.OpenAI")
-def test_unknown_base_url_clears_default_headers(mock_openai):
-    mock_openai.return_value = MagicMock()
-    agent = AIAgent(
-        api_key="test-key",
-        base_url="https://openrouter.ai/api/v1",
-        model="test/model",
-        quiet_mode=True,
-        skip_context_files=True,
-        skip_memory=True,
-    )
-    agent._client_kwargs["default_headers"] = {"X-Stale": "yes"}
-
-    agent._apply_client_headers_for_base_url("https://api.example.com/v1")
-
-    assert "default_headers" not in agent._client_kwargs
--- a/tests/test_vision_benchmark.py
+++ b/tests/test_vision_benchmark.py
@@ -11,12 +11,14 @@ import pytest
 sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))

 from vision_benchmark import (
+    analyze_with_model,
    compute_ocr_accuracy,
    compute_description_completeness,
    compute_structural_accuracy,
    aggregate_results,
    to_markdown,
    generate_sample_dataset,
+    load_dataset,
    MODELS,
    EVAL_PROMPTS,
 )
@@ -197,6 +199,71 @@ class TestMarkdown:


 class TestDataset:
+    def test_repo_dataset_uses_local_image_paths(self):
+        dataset_path = Path(__file__).parent.parent / "benchmarks" / "test_images.json"
+        dataset = json.loads(dataset_path.read_text())
+
+        assert dataset, "benchmark dataset should not be empty"
+        assert all(not entry["url"].startswith(("http://", "https://")) for entry in dataset)
+
+    def test_load_dataset_resolves_relative_local_paths(self, tmp_path):
+        images_dir = tmp_path / "images"
+        images_dir.mkdir()
+        image_path = images_dir / "sample.png"
+        image_path.write_bytes(b"png-bytes")
+
+        dataset_path = tmp_path / "dataset.json"
+        dataset_path.write_text(json.dumps([
+            {
+                "id": "sample",
+                "url": "images/sample.png",
+                "category": "photo",
+                "expected_keywords": [],
+                "expected_structure": {"min_length": 30, "min_sentences": 1},
+            }
+        ]))
+
+        loaded = load_dataset(str(dataset_path))
+
+        assert loaded[0]["url"] == str(image_path.resolve())
+
+    @pytest.mark.asyncio
+    async def test_analyze_with_model_encodes_local_file_as_data_url(self, tmp_path, monkeypatch):
+        image_path = tmp_path / "tiny.png"
+        image_path.write_bytes(
+            bytes.fromhex(
+                "89504E470D0A1A0A"
+                "0000000D49484452000000010000000108060000001F15C489"
+                "0000000D49444154789C6360000002000154A24F5D00000000"
+                "49454E44AE426082"
+            )
+        )
+
+        fake_response = MagicMock()
+        fake_response.raise_for_status.return_value = None
+        fake_response.json.return_value = {
+            "choices": [{"message": {"content": "Looks like a tiny image."}}],
+            "usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3},
+        }
+
+        fake_client = MagicMock()
+        fake_client.post = AsyncMock(return_value=fake_response)
+        fake_ctx = MagicMock()
+        fake_ctx.__aenter__ = AsyncMock(return_value=fake_client)
+        fake_ctx.__aexit__ = AsyncMock(return_value=None)
+
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test-key")
+        with patch("httpx.AsyncClient", return_value=fake_ctx):
+            result = await analyze_with_model(
+                str(image_path),
+                "Describe this image",
+                {"provider": "openrouter", "model_id": "fake/model"},
+            )
+
+        assert result["success"] is True
+        sent_url = fake_client.post.await_args.kwargs["json"]["messages"][0]["content"][1]["image_url"]["url"]
+        assert sent_url.startswith("data:image/png;base64,")
+
    def test_sample_dataset_has_entries(self):
        dataset = generate_sample_dataset()
        assert len(dataset) >= 4