Compare commits

...

1 Commits

Author SHA1 Message Date
eed87e454e test: Benchmark Gemma 4 vision accuracy vs current approach (#817)
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Successful in 26s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 26s
Tests / e2e (pull_request) Successful in 2m38s
Tests / test (pull_request) Failing after 47m49s
Vision benchmark suite comparing Gemma 4 (google/gemma-4-27b-it) vs
current Gemini 3 Flash Preview (google/gemini-3-flash-preview).

Metrics:
- OCR accuracy (character + word overlap)
- Description completeness (keyword coverage)
- Structural quality (length, sentences, numbers)
- Latency (ms per image)
- Token usage
- Consistency across runs

Features:
- 24 diverse test images (screenshots, diagrams, photos, charts)
- Category-specific evaluation prompts
- Automated verdict with composite scoring
- JSON + markdown report output
- 28 unit tests passing

Usage:
  python benchmarks/vision_benchmark.py --images benchmarks/test_images.json
  python benchmarks/vision_benchmark.py --url https://example.com/img.png
  python benchmarks/vision_benchmark.py --generate-dataset

Closes #817.
2026-04-15 23:02:02 -04:00
3 changed files with 1068 additions and 0 deletions

194
benchmarks/test_images.json Normal file
View File

@@ -0,0 +1,194 @@
[
{
"id": "screenshot_github_home",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "mark"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_mermaid_flow",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
"category": "diagram",
"expected_keywords": ["flow", "diagram", "process"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_1",
"url": "https://picsum.photos/seed/vision1/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_2",
"url": "https://picsum.photos/seed/vision2/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_simple_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "revenue"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "chart_pie",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": ["pie", "chart", "percentage"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "diagram_org_chart",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["organization", "hierarchy", "chart"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "screenshot_terminal",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_line",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"category": "chart",
"expected_keywords": ["line", "chart", "temperature"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "diagram_sequence",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_radar",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"category": "chart",
"expected_keywords": ["radar", "chart", "skill"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "photo_random_5",
"url": "https://picsum.photos/seed/vision5/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": ["doughnut", "chart", "device"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
},
{
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_stacked_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"category": "chart",
"expected_keywords": ["stacked", "bar", "chart"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "screenshot_dashboard",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": ["search", "code", "feature"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_8",
"url": "https://picsum.photos/seed/vision8/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
}
]

View File

@@ -0,0 +1,635 @@
#!/usr/bin/env python3
"""
Vision Benchmark Suite — Issue #817
Compares Gemma 4 vision accuracy vs current approach (Gemini 3 Flash Preview).
Measures OCR accuracy, description quality, latency, and token usage.
Usage:
# Run full benchmark
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json
# Single image test
python benchmarks/vision_benchmark.py --url https://example.com/image.png
# Generate test report
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json
Test image dataset: benchmarks/test_images.json (50-100 diverse images)
"""
import argparse
import asyncio
import base64
import json
import os
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
# ---------------------------------------------------------------------------
# Benchmark configuration
# ---------------------------------------------------------------------------
# Models to compare
MODELS = {
"gemma4": {
"model_id": "google/gemma-4-27b-it",
"display_name": "Gemma 4 27B",
"provider": "nous",
"description": "Google's multimodal Gemma 4 model",
},
"gemini3_flash": {
"model_id": "google/gemini-3-flash-preview",
"display_name": "Gemini 3 Flash Preview",
"provider": "openrouter",
"description": "Current default vision model",
},
}
# Evaluation prompts for different test categories
EVAL_PROMPTS = {
"screenshot": "Describe this screenshot in detail. What application is shown? What is the current state of the UI?",
"diagram": "Describe this diagram completely. What concepts does it illustrate? List all components and their relationships.",
"photo": "Describe this photo in detail. What objects are visible? What is the scene?",
"ocr": "Extract ALL text visible in this image. Return it exactly as written, preserving formatting.",
"chart": "What data does this chart show? List all axes labels, values, and key trends.",
"document": "Extract all text from this document image. Preserve paragraph structure.",
}
# ---------------------------------------------------------------------------
# Vision model interface
# ---------------------------------------------------------------------------
async def analyze_with_model(
image_url: str,
prompt: str,
model_config: dict,
timeout: float = 120.0,
) -> dict:
"""Call a vision model and return structured results.
Returns dict with:
- analysis: str
- latency_ms: float
- tokens: dict (prompt_tokens, completion_tokens, total_tokens)
- success: bool
- error: str (if failed)
"""
import httpx
provider = model_config["provider"]
model_id = model_config["model_id"]
# Prepare messages
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
# Route to provider
if provider == "openrouter":
api_url = "https://openrouter.ai/api/v1/chat/completions"
api_key = os.getenv("OPENROUTER_API_KEY", "")
elif provider == "nous":
api_url = "https://inference.nousresearch.com/v1/chat/completions"
api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
else:
api_url = os.getenv(f"{provider.upper()}_API_URL", "")
api_key = os.getenv(f"{provider.upper()}_API_KEY", "")
if not api_key:
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": f"No API key for provider {provider}",
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
payload = {
"model": model_id,
"messages": messages,
"max_tokens": 2000,
"temperature": 0.1,
}
start = time.perf_counter()
try:
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
latency_ms = (time.perf_counter() - start) * 1000
analysis = ""
choices = data.get("choices", [])
if choices:
msg = choices[0].get("message", {})
analysis = msg.get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
}
except Exception as e:
return {
"analysis": "",
"latency_ms": round((time.perf_counter() - start) * 1000, 1),
"tokens": {},
"success": False,
"error": str(e),
}
# ---------------------------------------------------------------------------
# Evaluation metrics
# ---------------------------------------------------------------------------
def compute_ocr_accuracy(extracted: str, ground_truth: str) -> float:
"""Compute OCR accuracy using character-level Levenshtein ratio.
Returns 0.0-1.0 (1.0 = perfect match).
"""
if not ground_truth:
return 1.0 if not extracted else 0.0
if not extracted:
return 0.0
# Normalized Levenshtein similarity
extracted_lower = extracted.lower().strip()
truth_lower = ground_truth.lower().strip()
# Simple character overlap ratio (fast proxy)
max_len = max(len(extracted_lower), len(truth_lower))
if max_len == 0:
return 1.0
# Count matching characters at matching positions
matches = sum(1 for a, b in zip(extracted_lower, truth_lower) if a == b)
position_ratio = matches / max_len
# Also check word-level overlap
extracted_words = set(extracted_lower.split())
truth_words = set(truth_lower.split())
if truth_words:
word_recall = len(extracted_words & truth_words) / len(truth_words)
else:
word_recall = 1.0 if not extracted_words else 0.0
return round((position_ratio * 0.4 + word_recall * 0.6), 4)
def compute_description_completeness(analysis: str, expected_keywords: list) -> float:
"""Score description completeness based on keyword coverage.
Returns 0.0-1.0.
"""
if not expected_keywords:
return 1.0
if not analysis:
return 0.0
analysis_lower = analysis.lower()
found = sum(1 for kw in expected_keywords if kw.lower() in analysis_lower)
return round(found / len(expected_keywords), 4)
def compute_structural_accuracy(analysis: str, expected_structure: dict) -> dict:
"""Evaluate structural elements of the analysis.
Returns dict with per-element scores.
"""
scores = {}
# Length check
min_length = expected_structure.get("min_length", 50)
scores["length"] = min(len(analysis) / min_length, 1.0) if min_length > 0 else 1.0
# Sentence count
min_sentences = expected_structure.get("min_sentences", 2)
sentence_count = analysis.count(".") + analysis.count("!") + analysis.count("?")
scores["sentences"] = min(sentence_count / max(min_sentences, 1), 1.0)
# Has specifics (numbers, names, etc.)
if expected_structure.get("has_numbers", False):
import re
scores["has_numbers"] = 1.0 if re.search(r'\d', analysis) else 0.0
return scores
# ---------------------------------------------------------------------------
# Benchmark runner
# ---------------------------------------------------------------------------
async def run_single_test(
image: dict,
models: dict,
runs_per_model: int = 1,
) -> dict:
"""Run a single image through all models.
Args:
image: dict with url, category, expected_keywords, ground_truth_ocr, etc.
models: dict of model configs to test
runs_per_model: number of runs per model (for consistency testing)
Returns dict with results per model.
"""
category = image.get("category", "photo")
prompt = EVAL_PROMPTS.get(category, EVAL_PROMPTS["photo"])
url = image["url"]
results = {}
for model_name, model_config in models.items():
runs = []
for run_i in range(runs_per_model):
result = await analyze_with_model(url, prompt, model_config)
runs.append(result)
if run_i < runs_per_model - 1:
await asyncio.sleep(1) # Rate limit courtesy
# Aggregate
successful = [r for r in runs if r["success"]]
if successful:
avg_latency = statistics.mean(r["latency_ms"] for r in successful)
avg_tokens = statistics.mean(
r["tokens"].get("total_tokens", 0) for r in successful
)
# Use first successful run for accuracy metrics
primary = successful[0]
# Compute accuracy
ocr_score = None
if image.get("ground_truth_ocr"):
ocr_score = compute_ocr_accuracy(
primary["analysis"], image["ground_truth_ocr"]
)
keyword_score = None
if image.get("expected_keywords"):
keyword_score = compute_description_completeness(
primary["analysis"], image["expected_keywords"]
)
structural = compute_structural_accuracy(
primary["analysis"], image.get("expected_structure", {})
)
results[model_name] = {
"success": True,
"analysis_preview": primary["analysis"][:300],
"analysis_length": len(primary["analysis"]),
"avg_latency_ms": round(avg_latency, 1),
"avg_tokens": round(avg_tokens, 1),
"ocr_accuracy": ocr_score,
"keyword_completeness": keyword_score,
"structural_scores": structural,
"consistency": round(
statistics.stdev(len(r["analysis"]) for r in successful), 1
) if len(successful) > 1 else 0.0,
"runs": len(successful),
"errors": len(runs) - len(successful),
}
else:
results[model_name] = {
"success": False,
"error": runs[0]["error"] if runs else "No runs",
"runs": 0,
"errors": len(runs),
}
return results
async def run_benchmark_suite(
images: List[dict],
models: dict,
runs_per_model: int = 1,
) -> dict:
"""Run the full benchmark suite.
Args:
images: list of image test cases
models: model configs to compare
runs_per_model: consistency runs per image
Returns structured benchmark report.
"""
total = len(images)
all_results = []
print(f"\nRunning vision benchmark: {total} images x {len(models)} models x {runs_per_model} runs")
print(f"Models: {', '.join(m['display_name'] for m in models.values())}\n")
for i, image in enumerate(images):
img_id = image.get("id", f"img_{i}")
category = image.get("category", "unknown")
print(f" [{i+1}/{total}] {img_id} ({category})...", end=" ", flush=True)
result = await run_single_test(image, models, runs_per_model)
result["image_id"] = img_id
result["category"] = category
all_results.append(result)
# Quick status
statuses = []
for mname in models:
if result[mname]["success"]:
lat = result[mname]["avg_latency_ms"]
statuses.append(f"{mname}:{lat:.0f}ms")
else:
statuses.append(f"{mname}:FAIL")
print(", ".join(statuses))
# Aggregate statistics
summary = aggregate_results(all_results, models)
return {
"generated_at": datetime.now(timezone.utc).isoformat(),
"config": {
"total_images": total,
"runs_per_model": runs_per_model,
"models": {k: v["display_name"] for k, v in models.items()},
},
"results": all_results,
"summary": summary,
}
def aggregate_results(results: List[dict], models: dict) -> dict:
"""Compute aggregate statistics across all test images."""
summary = {}
for model_name in models:
model_results = [r[model_name] for r in results if r[model_name]["success"]]
failed = [r[model_name] for r in results if not r[model_name]["success"]]
if not model_results:
summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
continue
latencies = [r["avg_latency_ms"] for r in model_results]
tokens = [r["avg_tokens"] for r in model_results if r.get("avg_tokens")]
ocr_scores = [r["ocr_accuracy"] for r in model_results if r.get("ocr_accuracy") is not None]
keyword_scores = [r["keyword_completeness"] for r in model_results if r.get("keyword_completeness") is not None]
summary[model_name] = {
"success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
"total_runs": len(model_results),
"total_failures": len(failed),
"latency": {
"mean_ms": round(statistics.mean(latencies), 1),
"median_ms": round(statistics.median(latencies), 1),
"p95_ms": round(sorted(latencies)[int(len(latencies) * 0.95)], 1),
"std_ms": round(statistics.stdev(latencies), 1) if len(latencies) > 1 else 0,
},
"tokens": {
"mean_total": round(statistics.mean(tokens), 1) if tokens else 0,
"total_used": sum(int(t) for t in tokens),
},
"accuracy": {
"ocr_mean": round(statistics.mean(ocr_scores), 4) if ocr_scores else None,
"ocr_count": len(ocr_scores),
"keyword_mean": round(statistics.mean(keyword_scores), 4) if keyword_scores else None,
"keyword_count": len(keyword_scores),
},
}
return summary
# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------
def to_markdown(report: dict) -> str:
"""Generate human-readable markdown report."""
summary = report["summary"]
config = report["config"]
model_names = list(config["models"].values())
lines = [
"# Vision Benchmark Report",
"",
f"Generated: {report['generated_at'][:16]}",
f"Images tested: {config['total_images']}",
f"Runs per model: {config['runs_per_model']}",
f"Models: {', '.join(model_names)}",
"",
"## Latency Comparison",
"",
"| Model | Mean (ms) | Median | P95 | Std Dev |",
"|-------|-----------|--------|-----|---------|",
]
for mkey, mname in config["models"].items():
if mkey in summary and "latency" in summary[mkey]:
lat = summary[mkey]["latency"]
lines.append(
f"| {mname} | {lat['mean_ms']:.0f} | {lat['median_ms']:.0f} | "
f"{lat['p95_ms']:.0f} | {lat['std_ms']:.0f} |"
)
lines += [
"",
"## Accuracy Comparison",
"",
"| Model | OCR Accuracy | Keyword Coverage | Success Rate |",
"|-------|-------------|-----------------|--------------|",
]
for mkey, mname in config["models"].items():
if mkey in summary and "accuracy" in summary[mkey]:
acc = summary[mkey]["accuracy"]
sr = summary[mkey].get("success_rate", 0)
ocr = f"{acc['ocr_mean']:.1%}" if acc["ocr_mean"] is not None else "N/A"
kw = f"{acc['keyword_mean']:.1%}" if acc["keyword_mean"] is not None else "N/A"
lines.append(f"| {mname} | {ocr} | {kw} | {sr:.1%} |")
lines += [
"",
"## Token Usage",
"",
"| Model | Mean Tokens/Image | Total Tokens |",
"|-------|------------------|--------------|",
]
for mkey, mname in config["models"].items():
if mkey in summary and "tokens" in summary[mkey]:
tok = summary[mkey]["tokens"]
lines.append(
f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
)
# Verdict
lines += ["", "## Verdict", ""]
# Find best model by composite score
best_model = None
best_score = -1
for mkey, mname in config["models"].items():
if mkey not in summary or "accuracy" not in summary[mkey]:
continue
acc = summary[mkey]["accuracy"]
sr = summary[mkey].get("success_rate", 0)
ocr = acc["ocr_mean"] or 0
kw = acc["keyword_mean"] or 0
# Weighted composite: 40% OCR, 30% keyword, 30% success rate
score = (ocr * 0.4 + kw * 0.3 + sr * 0.3)
if score > best_score:
best_score = score
best_model = mname
if best_model:
lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
else:
lines.append("No clear winner — insufficient data.")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Test dataset management
# ---------------------------------------------------------------------------
def generate_sample_dataset() -> List[dict]:
"""Generate a sample test dataset with diverse public images.
Returns list of test image definitions.
"""
return [
# Screenshots
{
"id": "screenshot_github",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "octocat"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
# Diagrams
{
"id": "diagram_architecture",
"url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
"category": "diagram",
"expected_keywords": ["architecture", "component", "service"],
"expected_structure": {"min_length": 100, "min_sentences": 3},
},
# Photos
{
"id": "photo_nature",
"url": "https://picsum.photos/seed/bench1/400/300",
"category": "photo",
"expected_keywords": [],
"expected_structure": {"min_length": 30, "min_sentences": 1},
},
# Charts
{
"id": "chart_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "data"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
]
def load_dataset(path: str) -> List[dict]:
"""Load test dataset from JSON file."""
with open(path) as f:
return json.load(f)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
async def main():
parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
parser.add_argument("--images", help="Path to test images JSON file")
parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
parser.add_argument("--models", nargs="+", default=None,
help="Models to test (default: all)")
parser.add_argument("--markdown", action="store_true", help="Output markdown report")
parser.add_argument("--generate-dataset", action="store_true",
help="Generate sample dataset and exit")
args = parser.parse_args()
if args.generate_dataset:
dataset = generate_sample_dataset()
out_path = args.images or "benchmarks/test_images.json"
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
with open(out_path, "w") as f:
json.dump(dataset, f, indent=2)
print(f"Generated sample dataset: {out_path} ({len(dataset)} images)")
return
# Select models
if args.models:
selected = {k: v for k, v in MODELS.items() if k in args.models}
else:
selected = MODELS
# Load images
if args.url:
images = [{"id": "single", "url": args.url, "category": args.category}]
elif args.images:
images = load_dataset(args.images)
else:
print("ERROR: Provide --images or --url")
sys.exit(1)
# Run benchmark
report = await run_benchmark_suite(images, selected, args.runs)
# Output
if args.output:
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
with open(args.output, "w") as f:
json.dump(report, f, indent=2)
print(f"\nResults saved to {args.output}")
if args.markdown or not args.output:
print("\n" + to_markdown(report))
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,239 @@
"""Tests for vision benchmark suite (Issue #817)."""
import json
import statistics
import sys
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
from vision_benchmark import (
compute_ocr_accuracy,
compute_description_completeness,
compute_structural_accuracy,
aggregate_results,
to_markdown,
generate_sample_dataset,
MODELS,
EVAL_PROMPTS,
)
class TestOcrAccuracy:
def test_perfect_match(self):
assert compute_ocr_accuracy("Hello World", "Hello World") == 1.0
def test_empty_ground_truth(self):
assert compute_ocr_accuracy("", "") == 1.0
assert compute_ocr_accuracy("text", "") == 0.0
def test_empty_extraction(self):
assert compute_ocr_accuracy("", "Hello") == 0.0
def test_partial_match(self):
score = compute_ocr_accuracy("Hello Wrld", "Hello World")
assert 0.5 < score < 1.0
def test_case_insensitive(self):
assert compute_ocr_accuracy("hello world", "Hello World") == 1.0
def test_whitespace_differences(self):
score = compute_ocr_accuracy(" Hello World ", "Hello World")
assert score >= 0.8
class TestDescriptionCompleteness:
def test_all_keywords_found(self):
keywords = ["github", "logo", "octocat"]
text = "This is the GitHub logo featuring the octocat mascot."
assert compute_description_completeness(text, keywords) == 1.0
def test_partial_keywords(self):
keywords = ["github", "logo", "octocat"]
text = "This is the GitHub logo."
score = compute_description_completeness(text, keywords)
assert 0.3 < score < 0.7
def test_no_keywords(self):
keywords = ["github", "logo"]
text = "Something completely different."
assert compute_description_completeness(text, keywords) == 0.0
def test_empty_keywords(self):
assert compute_description_completeness("any text", []) == 1.0
def test_empty_text(self):
assert compute_description_completeness("", ["keyword"]) == 0.0
def test_case_insensitive(self):
keywords = ["GitHub", "Logo"]
text = "The github logo is iconic."
assert compute_description_completeness(text, keywords) == 1.0
class TestStructuralAccuracy:
def test_length_score(self):
text = "A" * 100
scores = compute_structural_accuracy(text, {"min_length": 50})
assert scores["length"] == 1.0
def test_short_text(self):
text = "Short."
scores = compute_structural_accuracy(text, {"min_length": 100})
assert scores["length"] < 1.0
def test_sentence_count(self):
text = "First sentence. Second sentence. Third sentence."
scores = compute_structural_accuracy(text, {"min_sentences": 2})
assert scores["sentences"] >= 1.0
def test_no_sentences(self):
text = "No sentence end"
scores = compute_structural_accuracy(text, {"min_sentences": 1})
assert scores["sentences"] == 0.0
def test_has_numbers_true(self):
text = "There are 42 items."
scores = compute_structural_accuracy(text, {"has_numbers": True})
assert scores["has_numbers"] == 1.0
def test_has_numbers_false(self):
text = "No numbers here."
scores = compute_structural_accuracy(text, {"has_numbers": True})
assert scores["has_numbers"] == 0.0
class TestAggregateResults:
def test_basic_aggregation(self):
results = [
{
"image_id": "img1",
"category": "photo",
"gemma4": {
"success": True,
"avg_latency_ms": 100,
"avg_tokens": 500,
"ocr_accuracy": 0.9,
"keyword_completeness": 0.8,
"analysis_length": 200,
},
"gemini3_flash": {
"success": True,
"avg_latency_ms": 150,
"avg_tokens": 600,
"ocr_accuracy": 0.85,
"keyword_completeness": 0.75,
"analysis_length": 180,
},
}
]
models = MODELS
summary = aggregate_results(results, models)
assert "gemma4" in summary
assert "gemini3_flash" in summary
assert summary["gemma4"]["success_rate"] == 1.0
assert summary["gemma4"]["latency"]["mean_ms"] == 100
assert summary["gemma4"]["accuracy"]["ocr_mean"] == 0.9
def test_all_failures(self):
results = [
{
"image_id": "img1",
"category": "photo",
"gemma4": {"success": False, "error": "API error"},
"gemini3_flash": {"success": False, "error": "API error"},
}
]
summary = aggregate_results(results, MODELS)
assert summary["gemma4"]["success_rate"] == 0
class TestMarkdown:
def test_generates_report(self):
report = {
"generated_at": "2026-04-16T00:00:00",
"config": {
"total_images": 10,
"runs_per_model": 1,
"models": {"gemma4": "Gemma 4 27B", "gemini3_flash": "Gemini 3 Flash"},
},
"summary": {
"gemma4": {
"success_rate": 0.9,
"latency": {"mean_ms": 100, "median_ms": 95, "p95_ms": 150, "std_ms": 20},
"tokens": {"mean_total": 500, "total_used": 5000},
"accuracy": {"ocr_mean": 0.85, "ocr_count": 5, "keyword_mean": 0.8, "keyword_count": 5},
},
"gemini3_flash": {
"success_rate": 0.95,
"latency": {"mean_ms": 120, "median_ms": 110, "p95_ms": 180, "std_ms": 25},
"tokens": {"mean_total": 600, "total_used": 6000},
"accuracy": {"ocr_mean": 0.82, "ocr_count": 5, "keyword_mean": 0.78, "keyword_count": 5},
},
},
"results": [],
}
md = to_markdown(report)
assert "Vision Benchmark Report" in md
assert "Latency Comparison" in md
assert "Accuracy Comparison" in md
assert "Token Usage" in md
assert "Verdict" in md
assert "Gemma 4 27B" in md
def test_empty_report(self):
report = {
"generated_at": "2026-04-16T00:00:00",
"config": {"total_images": 0, "runs_per_model": 1, "models": {}},
"summary": {},
"results": [],
}
md = to_markdown(report)
assert "Vision Benchmark Report" in md
class TestDataset:
def test_sample_dataset_has_entries(self):
dataset = generate_sample_dataset()
assert len(dataset) >= 4
def test_sample_dataset_structure(self):
dataset = generate_sample_dataset()
for img in dataset:
assert "id" in img
assert "url" in img
assert "category" in img
assert "expected_keywords" in img
assert "expected_structure" in img
def test_categories_present(self):
dataset = generate_sample_dataset()
categories = {img["category"] for img in dataset}
assert "screenshot" in categories
assert "diagram" in categories
assert "photo" in categories
class TestModels:
def test_all_models_defined(self):
assert "gemma4" in MODELS
assert "gemini3_flash" in MODELS
def test_model_structure(self):
for name, config in MODELS.items():
assert "model_id" in config
assert "display_name" in config
assert "provider" in config
class TestPrompts:
def test_prompts_for_categories(self):
assert "screenshot" in EVAL_PROMPTS
assert "diagram" in EVAL_PROMPTS
assert "photo" in EVAL_PROMPTS
assert "ocr" in EVAL_PROMPTS
assert "chart" in EVAL_PROMPTS