#!/usr/bin/env python3 """ Vision Benchmark Suite — Issue #817 Compares Gemma 4 vision accuracy vs current approach (Gemini 3 Flash Preview). Measures OCR accuracy, description quality, latency, and token usage. Usage: # Run full benchmark python benchmarks/vision_benchmark.py --images benchmarks/test_images.json # Single image test python benchmarks/vision_benchmark.py --url https://example.com/image.png # Generate test report python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json Test image dataset: benchmarks/test_images.json (50-100 diverse images) """ import argparse import asyncio import base64 import json import os import statistics import sys import time from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional # --------------------------------------------------------------------------- # Benchmark configuration # --------------------------------------------------------------------------- # Models to compare MODELS = { "gemma4": { "model_id": "google/gemma-4-27b-it", "display_name": "Gemma 4 27B", "provider": "nous", "description": "Google's multimodal Gemma 4 model", }, "gemini3_flash": { "model_id": "google/gemini-3-flash-preview", "display_name": "Gemini 3 Flash Preview", "provider": "openrouter", "description": "Current default vision model", }, } # Evaluation prompts for different test categories EVAL_PROMPTS = { "screenshot": "Describe this screenshot in detail. What application is shown? What is the current state of the UI?", "diagram": "Describe this diagram completely. What concepts does it illustrate? List all components and their relationships.", "photo": "Describe this photo in detail. What objects are visible? What is the scene?", "ocr": "Extract ALL text visible in this image. Return it exactly as written, preserving formatting.", "chart": "What data does this chart show? List all axes labels, values, and key trends.", "document": "Extract all text from this document image. Preserve paragraph structure.", } # --------------------------------------------------------------------------- # Vision model interface # --------------------------------------------------------------------------- async def analyze_with_model( image_url: str, prompt: str, model_config: dict, timeout: float = 120.0, ) -> dict: """Call a vision model and return structured results. Returns dict with: - analysis: str - latency_ms: float - tokens: dict (prompt_tokens, completion_tokens, total_tokens) - success: bool - error: str (if failed) """ import httpx provider = model_config["provider"] model_id = model_config["model_id"] # Prepare messages messages = [ { "role": "user", "content": [ {"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": image_url}}, ], } ] # Route to provider if provider == "openrouter": api_url = "https://openrouter.ai/api/v1/chat/completions" api_key = os.getenv("OPENROUTER_API_KEY", "") elif provider == "nous": api_url = "https://inference.nousresearch.com/v1/chat/completions" api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "") else: api_url = os.getenv(f"{provider.upper()}_API_URL", "") api_key = os.getenv(f"{provider.upper()}_API_KEY", "") if not api_key: return { "analysis": "", "latency_ms": 0, "tokens": {}, "success": False, "error": f"No API key for provider {provider}", } headers = { "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", } payload = { "model": model_id, "messages": messages, "max_tokens": 2000, "temperature": 0.1, } start = time.perf_counter() try: async with httpx.AsyncClient(timeout=timeout) as client: resp = await client.post(api_url, json=payload, headers=headers) resp.raise_for_status() data = resp.json() latency_ms = (time.perf_counter() - start) * 1000 analysis = "" choices = data.get("choices", []) if choices: msg = choices[0].get("message", {}) analysis = msg.get("content", "") usage = data.get("usage", {}) tokens = { "prompt_tokens": usage.get("prompt_tokens", 0), "completion_tokens": usage.get("completion_tokens", 0), "total_tokens": usage.get("total_tokens", 0), } return { "analysis": analysis, "latency_ms": round(latency_ms, 1), "tokens": tokens, "success": True, "error": "", } except Exception as e: return { "analysis": "", "latency_ms": round((time.perf_counter() - start) * 1000, 1), "tokens": {}, "success": False, "error": str(e), } # --------------------------------------------------------------------------- # Evaluation metrics # --------------------------------------------------------------------------- def compute_ocr_accuracy(extracted: str, ground_truth: str) -> float: """Compute OCR accuracy using character-level Levenshtein ratio. Returns 0.0-1.0 (1.0 = perfect match). """ if not ground_truth: return 1.0 if not extracted else 0.0 if not extracted: return 0.0 # Normalized Levenshtein similarity extracted_lower = extracted.lower().strip() truth_lower = ground_truth.lower().strip() # Simple character overlap ratio (fast proxy) max_len = max(len(extracted_lower), len(truth_lower)) if max_len == 0: return 1.0 # Count matching characters at matching positions matches = sum(1 for a, b in zip(extracted_lower, truth_lower) if a == b) position_ratio = matches / max_len # Also check word-level overlap extracted_words = set(extracted_lower.split()) truth_words = set(truth_lower.split()) if truth_words: word_recall = len(extracted_words & truth_words) / len(truth_words) else: word_recall = 1.0 if not extracted_words else 0.0 return round((position_ratio * 0.4 + word_recall * 0.6), 4) def compute_description_completeness(analysis: str, expected_keywords: list) -> float: """Score description completeness based on keyword coverage. Returns 0.0-1.0. """ if not expected_keywords: return 1.0 if not analysis: return 0.0 analysis_lower = analysis.lower() found = sum(1 for kw in expected_keywords if kw.lower() in analysis_lower) return round(found / len(expected_keywords), 4) def compute_structural_accuracy(analysis: str, expected_structure: dict) -> dict: """Evaluate structural elements of the analysis. Returns dict with per-element scores. """ scores = {} # Length check min_length = expected_structure.get("min_length", 50) scores["length"] = min(len(analysis) / min_length, 1.0) if min_length > 0 else 1.0 # Sentence count min_sentences = expected_structure.get("min_sentences", 2) sentence_count = analysis.count(".") + analysis.count("!") + analysis.count("?") scores["sentences"] = min(sentence_count / max(min_sentences, 1), 1.0) # Has specifics (numbers, names, etc.) if expected_structure.get("has_numbers", False): import re scores["has_numbers"] = 1.0 if re.search(r'\d', analysis) else 0.0 return scores # --------------------------------------------------------------------------- # Benchmark runner # --------------------------------------------------------------------------- async def run_single_test( image: dict, models: dict, runs_per_model: int = 1, ) -> dict: """Run a single image through all models. Args: image: dict with url, category, expected_keywords, ground_truth_ocr, etc. models: dict of model configs to test runs_per_model: number of runs per model (for consistency testing) Returns dict with results per model. """ category = image.get("category", "photo") prompt = EVAL_PROMPTS.get(category, EVAL_PROMPTS["photo"]) url = image["url"] results = {} for model_name, model_config in models.items(): runs = [] for run_i in range(runs_per_model): result = await analyze_with_model(url, prompt, model_config) runs.append(result) if run_i < runs_per_model - 1: await asyncio.sleep(1) # Rate limit courtesy # Aggregate successful = [r for r in runs if r["success"]] if successful: avg_latency = statistics.mean(r["latency_ms"] for r in successful) avg_tokens = statistics.mean( r["tokens"].get("total_tokens", 0) for r in successful ) # Use first successful run for accuracy metrics primary = successful[0] # Compute accuracy ocr_score = None if image.get("ground_truth_ocr"): ocr_score = compute_ocr_accuracy( primary["analysis"], image["ground_truth_ocr"] ) keyword_score = None if image.get("expected_keywords"): keyword_score = compute_description_completeness( primary["analysis"], image["expected_keywords"] ) structural = compute_structural_accuracy( primary["analysis"], image.get("expected_structure", {}) ) results[model_name] = { "success": True, "analysis_preview": primary["analysis"][:300], "analysis_length": len(primary["analysis"]), "avg_latency_ms": round(avg_latency, 1), "avg_tokens": round(avg_tokens, 1), "ocr_accuracy": ocr_score, "keyword_completeness": keyword_score, "structural_scores": structural, "consistency": round( statistics.stdev(len(r["analysis"]) for r in successful), 1 ) if len(successful) > 1 else 0.0, "runs": len(successful), "errors": len(runs) - len(successful), } else: results[model_name] = { "success": False, "error": runs[0]["error"] if runs else "No runs", "runs": 0, "errors": len(runs), } return results async def run_benchmark_suite( images: List[dict], models: dict, runs_per_model: int = 1, ) -> dict: """Run the full benchmark suite. Args: images: list of image test cases models: model configs to compare runs_per_model: consistency runs per image Returns structured benchmark report. """ total = len(images) all_results = [] print(f"\nRunning vision benchmark: {total} images x {len(models)} models x {runs_per_model} runs") print(f"Models: {', '.join(m['display_name'] for m in models.values())}\n") for i, image in enumerate(images): img_id = image.get("id", f"img_{i}") category = image.get("category", "unknown") print(f" [{i+1}/{total}] {img_id} ({category})...", end=" ", flush=True) result = await run_single_test(image, models, runs_per_model) result["image_id"] = img_id result["category"] = category all_results.append(result) # Quick status statuses = [] for mname in models: if result[mname]["success"]: lat = result[mname]["avg_latency_ms"] statuses.append(f"{mname}:{lat:.0f}ms") else: statuses.append(f"{mname}:FAIL") print(", ".join(statuses)) # Aggregate statistics summary = aggregate_results(all_results, models) return { "generated_at": datetime.now(timezone.utc).isoformat(), "config": { "total_images": total, "runs_per_model": runs_per_model, "models": {k: v["display_name"] for k, v in models.items()}, }, "results": all_results, "summary": summary, } def aggregate_results(results: List[dict], models: dict) -> dict: """Compute aggregate statistics across all test images.""" summary = {} for model_name in models: model_results = [r[model_name] for r in results if r[model_name]["success"]] failed = [r[model_name] for r in results if not r[model_name]["success"]] if not model_results: summary[model_name] = {"success_rate": 0, "error": "All runs failed"} continue latencies = [r["avg_latency_ms"] for r in model_results] tokens = [r["avg_tokens"] for r in model_results if r.get("avg_tokens")] ocr_scores = [r["ocr_accuracy"] for r in model_results if r.get("ocr_accuracy") is not None] keyword_scores = [r["keyword_completeness"] for r in model_results if r.get("keyword_completeness") is not None] summary[model_name] = { "success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4), "total_runs": len(model_results), "total_failures": len(failed), "latency": { "mean_ms": round(statistics.mean(latencies), 1), "median_ms": round(statistics.median(latencies), 1), "p95_ms": round(sorted(latencies)[int(len(latencies) * 0.95)], 1), "std_ms": round(statistics.stdev(latencies), 1) if len(latencies) > 1 else 0, }, "tokens": { "mean_total": round(statistics.mean(tokens), 1) if tokens else 0, "total_used": sum(int(t) for t in tokens), }, "accuracy": { "ocr_mean": round(statistics.mean(ocr_scores), 4) if ocr_scores else None, "ocr_count": len(ocr_scores), "keyword_mean": round(statistics.mean(keyword_scores), 4) if keyword_scores else None, "keyword_count": len(keyword_scores), }, } return summary # --------------------------------------------------------------------------- # Report generation # --------------------------------------------------------------------------- def to_markdown(report: dict) -> str: """Generate human-readable markdown report.""" summary = report["summary"] config = report["config"] model_names = list(config["models"].values()) lines = [ "# Vision Benchmark Report", "", f"Generated: {report['generated_at'][:16]}", f"Images tested: {config['total_images']}", f"Runs per model: {config['runs_per_model']}", f"Models: {', '.join(model_names)}", "", "## Latency Comparison", "", "| Model | Mean (ms) | Median | P95 | Std Dev |", "|-------|-----------|--------|-----|---------|", ] for mkey, mname in config["models"].items(): if mkey in summary and "latency" in summary[mkey]: lat = summary[mkey]["latency"] lines.append( f"| {mname} | {lat['mean_ms']:.0f} | {lat['median_ms']:.0f} | " f"{lat['p95_ms']:.0f} | {lat['std_ms']:.0f} |" ) lines += [ "", "## Accuracy Comparison", "", "| Model | OCR Accuracy | Keyword Coverage | Success Rate |", "|-------|-------------|-----------------|--------------|", ] for mkey, mname in config["models"].items(): if mkey in summary and "accuracy" in summary[mkey]: acc = summary[mkey]["accuracy"] sr = summary[mkey].get("success_rate", 0) ocr = f"{acc['ocr_mean']:.1%}" if acc["ocr_mean"] is not None else "N/A" kw = f"{acc['keyword_mean']:.1%}" if acc["keyword_mean"] is not None else "N/A" lines.append(f"| {mname} | {ocr} | {kw} | {sr:.1%} |") lines += [ "", "## Token Usage", "", "| Model | Mean Tokens/Image | Total Tokens |", "|-------|------------------|--------------|", ] for mkey, mname in config["models"].items(): if mkey in summary and "tokens" in summary[mkey]: tok = summary[mkey]["tokens"] lines.append( f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |" ) # Verdict lines += ["", "## Verdict", ""] # Find best model by composite score best_model = None best_score = -1 for mkey, mname in config["models"].items(): if mkey not in summary or "accuracy" not in summary[mkey]: continue acc = summary[mkey]["accuracy"] sr = summary[mkey].get("success_rate", 0) ocr = acc["ocr_mean"] or 0 kw = acc["keyword_mean"] or 0 # Weighted composite: 40% OCR, 30% keyword, 30% success rate score = (ocr * 0.4 + kw * 0.3 + sr * 0.3) if score > best_score: best_score = score best_model = mname if best_model: lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})") else: lines.append("No clear winner — insufficient data.") return "\n".join(lines) # --------------------------------------------------------------------------- # Test dataset management # --------------------------------------------------------------------------- def generate_sample_dataset() -> List[dict]: """Generate a sample test dataset with diverse public images. Returns list of test image definitions. """ return [ # Screenshots { "id": "screenshot_github", "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", "category": "screenshot", "expected_keywords": ["github", "logo", "octocat"], "expected_structure": {"min_length": 50, "min_sentences": 2}, }, # Diagrams { "id": "diagram_architecture", "url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw", "category": "diagram", "expected_keywords": ["architecture", "component", "service"], "expected_structure": {"min_length": 100, "min_sentences": 3}, }, # Photos { "id": "photo_nature", "url": "https://picsum.photos/seed/bench1/400/300", "category": "photo", "expected_keywords": [], "expected_structure": {"min_length": 30, "min_sentences": 1}, }, # Charts { "id": "chart_bar", "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}", "category": "chart", "expected_keywords": ["bar", "chart", "data"], "expected_structure": {"min_length": 50, "min_sentences": 2}, }, ] def load_dataset(path: str) -> List[dict]: """Load test dataset from JSON file.""" with open(path) as f: return json.load(f) # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- async def main(): parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)") parser.add_argument("--images", help="Path to test images JSON file") parser.add_argument("--url", help="Single image URL to test") parser.add_argument("--category", default="photo", help="Category for single URL") parser.add_argument("--output", default=None, help="Output JSON file") parser.add_argument("--runs", type=int, default=1, help="Runs per model per image") parser.add_argument("--models", nargs="+", default=None, help="Models to test (default: all)") parser.add_argument("--markdown", action="store_true", help="Output markdown report") parser.add_argument("--generate-dataset", action="store_true", help="Generate sample dataset and exit") args = parser.parse_args() if args.generate_dataset: dataset = generate_sample_dataset() out_path = args.images or "benchmarks/test_images.json" os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True) with open(out_path, "w") as f: json.dump(dataset, f, indent=2) print(f"Generated sample dataset: {out_path} ({len(dataset)} images)") return # Select models if args.models: selected = {k: v for k, v in MODELS.items() if k in args.models} else: selected = MODELS # Load images if args.url: images = [{"id": "single", "url": args.url, "category": args.category}] elif args.images: images = load_dataset(args.images) else: print("ERROR: Provide --images or --url") sys.exit(1) # Run benchmark report = await run_benchmark_suite(images, selected, args.runs) # Output if args.output: os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) with open(args.output, "w") as f: json.dump(report, f, indent=2) print(f"\nResults saved to {args.output}") if args.markdown or not args.output: print("\n" + to_markdown(report)) if __name__ == "__main__": asyncio.run(main())