Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fa81831cd2 |
42
benchmarks/test_images.json
Normal file
42
benchmarks/test_images.json
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"id": "img_001",
|
||||||
|
"name": "red_circle",
|
||||||
|
"path": "benchmarks/test_images/red_circle.png",
|
||||||
|
"description": "A red circle on a white background",
|
||||||
|
"expected_answer_contains": ["red", "circle"],
|
||||||
|
"category": "shape_color"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "img_002",
|
||||||
|
"name": "blue_square",
|
||||||
|
"path": "benchmarks/test_images/blue_square.png",
|
||||||
|
"description": "A blue square on a white background",
|
||||||
|
"expected_answer_contains": ["blue", "square"],
|
||||||
|
"category": "shape_color"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "img_003",
|
||||||
|
"name": "green_triangle",
|
||||||
|
"path": "benchmarks/test_images/green_triangle.png",
|
||||||
|
"description": "A green triangle on a white background",
|
||||||
|
"expected_answer_contains": ["green", "triangle"],
|
||||||
|
"category": "shape_color"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "img_004",
|
||||||
|
"name": "text_hello",
|
||||||
|
"path": "benchmarks/test_images/text_hello.png",
|
||||||
|
"description": "An image containing the text 'Hello World'",
|
||||||
|
"expected_answer_contains": ["hello", "world"],
|
||||||
|
"category": "ocr"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "img_005",
|
||||||
|
"name": "mixed_shapes",
|
||||||
|
"path": "benchmarks/test_images/mixed_shapes.png",
|
||||||
|
"description": "Multiple colored shapes: red circle, blue square, yellow star",
|
||||||
|
"expected_answer_contains": ["red", "blue", "yellow"],
|
||||||
|
"category": "counting"
|
||||||
|
}
|
||||||
|
]
|
||||||
BIN
benchmarks/test_images/blue_square.png
Normal file
BIN
benchmarks/test_images/blue_square.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 779 B |
BIN
benchmarks/test_images/green_triangle.png
Normal file
BIN
benchmarks/test_images/green_triangle.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.3 KiB |
BIN
benchmarks/test_images/mixed_shapes.png
Normal file
BIN
benchmarks/test_images/mixed_shapes.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.2 KiB |
BIN
benchmarks/test_images/red_circle.png
Normal file
BIN
benchmarks/test_images/red_circle.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.4 KiB |
BIN
benchmarks/test_images/text_hello.png
Normal file
BIN
benchmarks/test_images/text_hello.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.3 KiB |
204
benchmarks/vision_benchmark.py
Normal file
204
benchmarks/vision_benchmark.py
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Vision benchmark — test model image understanding with local test images.
|
||||||
|
|
||||||
|
Uses locally-stored test images (not external URLs) for reliable CI.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 benchmarks/vision_benchmark.py --model hermes3
|
||||||
|
python3 benchmarks/vision_benchmark.py --model qwen2.5 --json
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
BENCHMARK_DIR = Path(__file__).resolve().parent
|
||||||
|
TEST_IMAGES_FILE = BENCHMARK_DIR / "test_images.json"
|
||||||
|
|
||||||
|
|
||||||
|
def load_test_dataset() -> List[Dict[str, Any]]:
    """Load the test-image dataset from ``test_images.json``.

    Returns:
        A list of test-case dicts, each with ``id``, ``name``, ``path``,
        ``description``, ``expected_answer_contains`` and ``category`` keys.

    Raises:
        FileNotFoundError: if the dataset file is missing.
        json.JSONDecodeError: if the file contents are not valid JSON.
    """
    if not TEST_IMAGES_FILE.exists():
        raise FileNotFoundError(f"Test dataset not found: {TEST_IMAGES_FILE}")
    # JSON is UTF-8 by spec; don't rely on the locale's default encoding.
    with open(TEST_IMAGES_FILE, encoding="utf-8") as f:
        return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def encode_image_base64(image_path: str) -> str:
    """Return the file at *image_path* as a base64-encoded ASCII string."""
    raw_bytes = Path(image_path).read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
|
||||||
|
|
||||||
|
|
||||||
|
def verify_images_exist(dataset: List[Dict[str, Any]]) -> List[str]:
    """Return the relative ``path`` of every dataset image absent on disk."""
    repo_root = BENCHMARK_DIR.parent
    return [
        entry["path"]
        for entry in dataset
        if not (repo_root / entry["path"]).exists()
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def run_vision_test(
    image_path: str,
    prompt: str,
    base_url: str = "http://localhost:11434/v1",
    model: str = "",
    api_key: str = "",
    timeout: int = 30,
    max_tokens: int = 200,
) -> Dict[str, Any]:
    """Run a single vision test against an OpenAI-compatible chat endpoint.

    Args:
        image_path: Local path of the image to send (inlined as base64).
        prompt: Text question to ask about the image.
        base_url: API root, e.g. an Ollama server's ``/v1`` endpoint.
        model: Model name; empty string lets the server choose its default.
        api_key: Optional bearer token; header is omitted when empty.
        timeout: Socket timeout for the HTTP request, in seconds.
        max_tokens: Completion-length cap sent to the API (previously a
            hard-coded 200; now a parameter with the same default).

    Returns:
        Dict with ``success``, ``response``, ``latency_ms`` and either
        ``model`` (on success) or ``error`` (on failure). Never raises:
        any request failure is reported in the returned dict so the
        benchmark loop can continue.
    """
    # Imported lazily so importing this module stays cheap.
    import urllib.request

    img_b64 = encode_image_base64(image_path)

    # OpenAI-style multimodal message: a text part plus an inline
    # data-URL image part (avoids needing any external image hosting).
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{img_b64}"},
                },
            ],
        }
    ]

    body = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
    }

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    url = f"{base_url.rstrip('/')}/chat/completions"
    t0 = time.monotonic()

    try:
        req = urllib.request.Request(url, data=json.dumps(body).encode(), headers=headers, method="POST")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read())
        elapsed = time.monotonic() - t0
        # Defensive extraction: tolerate a missing/empty choices list.
        content = data.get("choices", [{}])[0].get("message", {}).get("content", "")
        return {
            "success": True,
            "response": content,
            "latency_ms": int(elapsed * 1000),
            "model": data.get("model", model),
        }
    except Exception as e:  # boundary: report the failure instead of raising
        return {
            "success": False,
            "response": "",
            "latency_ms": int((time.monotonic() - t0) * 1000),
            "error": str(e),
        }
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_response(response: str, expected: List[str]) -> bool:
    """Return True when every expected keyword occurs (case-insensitively) in *response*."""
    haystack = response.lower()
    for keyword in expected:
        if keyword.lower() not in haystack:
            return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def run_benchmark(
    base_url: str = "http://localhost:11434/v1",
    model: str = "",
) -> Dict[str, Any]:
    """Run every test case in the dataset and summarise the results."""
    dataset = load_test_dataset()

    # Fail fast when any fixture image is missing from the checkout.
    missing = verify_images_exist(dataset)
    if missing:
        return {"error": f"Missing test images: {missing}", "passed": 0, "total": len(dataset)}

    # Same question for every image; keyword matching does the grading.
    prompt = "What do you see in this image? Describe the shapes and colors."
    results: List[Dict[str, Any]] = []
    passed = 0

    for case in dataset:
        outcome = run_vision_test(
            str(BENCHMARK_DIR.parent / case["path"]),
            prompt,
            base_url=base_url,
            model=model,
        )
        outcome["test_id"] = case["id"]
        outcome["test_name"] = case["name"]
        outcome["category"] = case["category"]

        correct = False
        if outcome["success"]:
            correct = evaluate_response(outcome["response"], case["expected_answer_contains"])
        outcome["correct"] = correct
        if correct:
            passed += 1

        results.append(outcome)

    return {
        "model": model,
        "base_url": base_url,
        "passed": passed,
        "total": len(dataset),
        "success_rate": passed / len(dataset) if dataset else 0,
        "results": results,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def format_report(benchmark: Dict[str, Any]) -> str:
    """Render a benchmark-result dict as a human-readable multi-line report."""
    if "error" in benchmark:
        return f"ERROR: {benchmark['error']}"

    header = [
        "Vision Benchmark Results",
        "=" * 40,
        f"Model: {benchmark.get('model', 'unknown')}",
        f"Passed: {benchmark['passed']}/{benchmark['total']} ({benchmark['success_rate']:.0%})",
        "",
    ]

    detail: List[str] = []
    for entry in benchmark.get("results", []):
        # Green check for correct answers, red cross otherwise.
        icon = "\u2705" if entry.get("correct") else "\u274c"
        name = entry.get("test_name", "?")
        cat = entry.get("category", "?")
        lat = entry.get("latency_ms", 0)
        detail.append(f" {icon} {name} ({cat}) — {lat}ms")
        if not entry.get("success"):
            detail.append(f" Error: {entry.get('error', 'unknown')}")
        elif not entry.get("correct"):
            # Show a truncated transcript so the mismatch is debuggable.
            detail.append(f" Got: {entry.get('response', '')[:100]}")

    return "\n".join(header + detail)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: run the benchmark and print a report.

    Returns 0 when at least 80% of the tests pass, 1 otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Vision benchmark")
    parser.add_argument("--base-url", default="http://localhost:11434/v1")
    parser.add_argument("--model", default="")
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()

    benchmark = run_benchmark(base_url=args.base_url, model=args.model)

    # Machine-readable JSON for tooling, formatted report for humans.
    output = json.dumps(benchmark, indent=2) if args.json else format_report(benchmark)
    print(output)

    # CI gate: exit status reflects the 80% pass-rate threshold.
    return 0 if benchmark.get("success_rate", 0) >= 0.8 else 1
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry: propagate main()'s pass/fail status as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||||
Reference in New Issue
Block a user