"""Tests for vision benchmark suite (Issue #817).""" import json import statistics import sys from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks")) from vision_benchmark import ( compute_ocr_accuracy, compute_description_completeness, compute_structural_accuracy, aggregate_results, to_markdown, generate_sample_dataset, MODELS, EVAL_PROMPTS, ) class TestOcrAccuracy: def test_perfect_match(self): assert compute_ocr_accuracy("Hello World", "Hello World") == 1.0 def test_empty_ground_truth(self): assert compute_ocr_accuracy("", "") == 1.0 assert compute_ocr_accuracy("text", "") == 0.0 def test_empty_extraction(self): assert compute_ocr_accuracy("", "Hello") == 0.0 def test_partial_match(self): score = compute_ocr_accuracy("Hello Wrld", "Hello World") assert 0.5 < score < 1.0 def test_case_insensitive(self): assert compute_ocr_accuracy("hello world", "Hello World") == 1.0 def test_whitespace_differences(self): score = compute_ocr_accuracy(" Hello World ", "Hello World") assert score >= 0.8 class TestDescriptionCompleteness: def test_all_keywords_found(self): keywords = ["github", "logo", "octocat"] text = "This is the GitHub logo featuring the octocat mascot." assert compute_description_completeness(text, keywords) == 1.0 def test_partial_keywords(self): keywords = ["github", "logo", "octocat"] text = "This is the GitHub logo." score = compute_description_completeness(text, keywords) assert 0.3 < score < 0.7 def test_no_keywords(self): keywords = ["github", "logo"] text = "Something completely different." assert compute_description_completeness(text, keywords) == 0.0 def test_empty_keywords(self): assert compute_description_completeness("any text", []) == 1.0 def test_empty_text(self): assert compute_description_completeness("", ["keyword"]) == 0.0 def test_case_insensitive(self): keywords = ["GitHub", "Logo"] text = "The github logo is iconic." assert compute_description_completeness(text, keywords) == 1.0 class TestStructuralAccuracy: def test_length_score(self): text = "A" * 100 scores = compute_structural_accuracy(text, {"min_length": 50}) assert scores["length"] == 1.0 def test_short_text(self): text = "Short." scores = compute_structural_accuracy(text, {"min_length": 100}) assert scores["length"] < 1.0 def test_sentence_count(self): text = "First sentence. Second sentence. Third sentence." scores = compute_structural_accuracy(text, {"min_sentences": 2}) assert scores["sentences"] >= 1.0 def test_no_sentences(self): text = "No sentence end" scores = compute_structural_accuracy(text, {"min_sentences": 1}) assert scores["sentences"] == 0.0 def test_has_numbers_true(self): text = "There are 42 items." scores = compute_structural_accuracy(text, {"has_numbers": True}) assert scores["has_numbers"] == 1.0 def test_has_numbers_false(self): text = "No numbers here." scores = compute_structural_accuracy(text, {"has_numbers": True}) assert scores["has_numbers"] == 0.0 class TestAggregateResults: def test_basic_aggregation(self): results = [ { "image_id": "img1", "category": "photo", "gemma4": { "success": True, "avg_latency_ms": 100, "avg_tokens": 500, "ocr_accuracy": 0.9, "keyword_completeness": 0.8, "analysis_length": 200, }, "gemini3_flash": { "success": True, "avg_latency_ms": 150, "avg_tokens": 600, "ocr_accuracy": 0.85, "keyword_completeness": 0.75, "analysis_length": 180, }, } ] models = MODELS summary = aggregate_results(results, models) assert "gemma4" in summary assert "gemini3_flash" in summary assert summary["gemma4"]["success_rate"] == 1.0 assert summary["gemma4"]["latency"]["mean_ms"] == 100 assert summary["gemma4"]["accuracy"]["ocr_mean"] == 0.9 def test_all_failures(self): results = [ { "image_id": "img1", "category": "photo", "gemma4": {"success": False, "error": "API error"}, "gemini3_flash": {"success": False, "error": "API error"}, } ] summary = aggregate_results(results, MODELS) assert summary["gemma4"]["success_rate"] == 0 class TestMarkdown: def test_generates_report(self): report = { "generated_at": "2026-04-16T00:00:00", "config": { "total_images": 10, "runs_per_model": 1, "models": {"gemma4": "Gemma 4 27B", "gemini3_flash": "Gemini 3 Flash"}, }, "summary": { "gemma4": { "success_rate": 0.9, "latency": {"mean_ms": 100, "median_ms": 95, "p95_ms": 150, "std_ms": 20}, "tokens": {"mean_total": 500, "total_used": 5000}, "accuracy": {"ocr_mean": 0.85, "ocr_count": 5, "keyword_mean": 0.8, "keyword_count": 5}, }, "gemini3_flash": { "success_rate": 0.95, "latency": {"mean_ms": 120, "median_ms": 110, "p95_ms": 180, "std_ms": 25}, "tokens": {"mean_total": 600, "total_used": 6000}, "accuracy": {"ocr_mean": 0.82, "ocr_count": 5, "keyword_mean": 0.78, "keyword_count": 5}, }, }, "results": [], } md = to_markdown(report) assert "Vision Benchmark Report" in md assert "Latency Comparison" in md assert "Accuracy Comparison" in md assert "Token Usage" in md assert "Verdict" in md assert "Gemma 4 27B" in md def test_empty_report(self): report = { "generated_at": "2026-04-16T00:00:00", "config": {"total_images": 0, "runs_per_model": 1, "models": {}}, "summary": {}, "results": [], } md = to_markdown(report) assert "Vision Benchmark Report" in md class TestDataset: def test_sample_dataset_has_entries(self): dataset = generate_sample_dataset() assert len(dataset) >= 4 def test_sample_dataset_structure(self): dataset = generate_sample_dataset() for img in dataset: assert "id" in img assert "url" in img assert "category" in img assert "expected_keywords" in img assert "expected_structure" in img def test_categories_present(self): dataset = generate_sample_dataset() categories = {img["category"] for img in dataset} assert "screenshot" in categories assert "diagram" in categories assert "photo" in categories class TestModels: def test_all_models_defined(self): assert "gemma4" in MODELS assert "gemini3_flash" in MODELS def test_model_structure(self): for name, config in MODELS.items(): assert "model_id" in config assert "display_name" in config assert "provider" in config class TestPrompts: def test_prompts_for_categories(self): assert "screenshot" in EVAL_PROMPTS assert "diagram" in EVAL_PROMPTS assert "photo" in EVAL_PROMPTS assert "ocr" in EVAL_PROMPTS assert "chart" in EVAL_PROMPTS