Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Successful in 26s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 26s
Tests / e2e (pull_request) Successful in 2m38s
Tests / test (pull_request) Failing after 47m49s
Vision benchmark suite comparing Gemma 4 (google/gemma-4-27b-it) vs current Gemini 3 Flash Preview (google/gemini-3-flash-preview). Metrics: - OCR accuracy (character + word overlap) - Description completeness (keyword coverage) - Structural quality (length, sentences, numbers) - Latency (ms per image) - Token usage - Consistency across runs Features: - 24 diverse test images (screenshots, diagrams, photos, charts) - Category-specific evaluation prompts - Automated verdict with composite scoring - JSON + markdown report output - 28 unit tests passing Usage: python benchmarks/vision_benchmark.py --images benchmarks/test_images.json python benchmarks/vision_benchmark.py --url https://example.com/img.png python benchmarks/vision_benchmark.py --generate-dataset Closes #817.
240 lines
7.9 KiB
Python
240 lines
7.9 KiB
Python
"""Tests for vision benchmark suite (Issue #817)."""
|
|
|
|
import json
|
|
import statistics
|
|
import sys
|
|
from pathlib import Path
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
|
|
|
|
from vision_benchmark import (
|
|
compute_ocr_accuracy,
|
|
compute_description_completeness,
|
|
compute_structural_accuracy,
|
|
aggregate_results,
|
|
to_markdown,
|
|
generate_sample_dataset,
|
|
MODELS,
|
|
EVAL_PROMPTS,
|
|
)
|
|
|
|
|
|
class TestOcrAccuracy:
    """Unit tests for compute_ocr_accuracy."""

    def test_perfect_match(self):
        """Identical strings earn a perfect score."""
        assert compute_ocr_accuracy("Hello World", "Hello World") == 1.0

    def test_empty_ground_truth(self):
        """Empty ground truth is perfect only when the extraction is empty too."""
        assert compute_ocr_accuracy("", "") == 1.0
        assert compute_ocr_accuracy("text", "") == 0.0

    def test_empty_extraction(self):
        """Extracting nothing against real text scores zero."""
        assert compute_ocr_accuracy("", "Hello") == 0.0

    def test_partial_match(self):
        """A near-miss (one dropped character) lands strictly between 0.5 and 1.0."""
        result = compute_ocr_accuracy("Hello Wrld", "Hello World")
        assert 0.5 < result < 1.0

    def test_case_insensitive(self):
        """Capitalization differences do not reduce the score."""
        assert compute_ocr_accuracy("hello world", "Hello World") == 1.0

    def test_whitespace_differences(self):
        """Surrounding whitespace costs at most a small penalty."""
        result = compute_ocr_accuracy("  Hello World  ", "Hello World")
        assert result >= 0.8
|
|
|
|
|
|
class TestDescriptionCompleteness:
    """Unit tests for compute_description_completeness."""

    def test_all_keywords_found(self):
        """Every expected keyword appearing in the text yields 1.0."""
        wanted = ["github", "logo", "octocat"]
        description = "This is the GitHub logo featuring the octocat mascot."
        assert compute_description_completeness(description, wanted) == 1.0

    def test_partial_keywords(self):
        """One of three keywords missing lands the score mid-range."""
        wanted = ["github", "logo", "octocat"]
        result = compute_description_completeness("This is the GitHub logo.", wanted)
        assert 0.3 < result < 0.7

    def test_no_keywords(self):
        """No keyword hits at all scores zero."""
        result = compute_description_completeness(
            "Something completely different.", ["github", "logo"]
        )
        assert result == 0.0

    def test_empty_keywords(self):
        """With nothing expected, any text is trivially complete."""
        assert compute_description_completeness("any text", []) == 1.0

    def test_empty_text(self):
        """Empty text cannot contain any keyword."""
        assert compute_description_completeness("", ["keyword"]) == 0.0

    def test_case_insensitive(self):
        """Keyword matching ignores letter case."""
        result = compute_description_completeness(
            "The github logo is iconic.", ["GitHub", "Logo"]
        )
        assert result == 1.0
|
|
|
|
|
|
class TestStructuralAccuracy:
    """Unit tests for compute_structural_accuracy."""

    def test_length_score(self):
        """Text at or beyond min_length earns the full length score."""
        long_text = "A" * 100
        result = compute_structural_accuracy(long_text, {"min_length": 50})
        assert result["length"] == 1.0

    def test_short_text(self):
        """Text shorter than min_length is penalized."""
        result = compute_structural_accuracy("Short.", {"min_length": 100})
        assert result["length"] < 1.0

    def test_sentence_count(self):
        """Three sentences comfortably satisfy a two-sentence minimum."""
        sample = "First sentence. Second sentence. Third sentence."
        result = compute_structural_accuracy(sample, {"min_sentences": 2})
        assert result["sentences"] >= 1.0

    def test_no_sentences(self):
        """Text with no sentence terminator scores zero on sentences."""
        result = compute_structural_accuracy("No sentence end", {"min_sentences": 1})
        assert result["sentences"] == 0.0

    def test_has_numbers_true(self):
        """Digits present satisfy the has_numbers expectation."""
        result = compute_structural_accuracy("There are 42 items.", {"has_numbers": True})
        assert result["has_numbers"] == 1.0

    def test_has_numbers_false(self):
        """No digits fails the has_numbers expectation."""
        result = compute_structural_accuracy("No numbers here.", {"has_numbers": True})
        assert result["has_numbers"] == 0.0
|
|
|
|
|
|
class TestAggregateResults:
    """Unit tests for aggregate_results."""

    def test_basic_aggregation(self):
        """One fully-successful image aggregates into per-model summary stats."""
        gemma_metrics = {
            "success": True,
            "avg_latency_ms": 100,
            "avg_tokens": 500,
            "ocr_accuracy": 0.9,
            "keyword_completeness": 0.8,
            "analysis_length": 200,
        }
        flash_metrics = {
            "success": True,
            "avg_latency_ms": 150,
            "avg_tokens": 600,
            "ocr_accuracy": 0.85,
            "keyword_completeness": 0.75,
            "analysis_length": 180,
        }
        rows = [
            {
                "image_id": "img1",
                "category": "photo",
                "gemma4": gemma_metrics,
                "gemini3_flash": flash_metrics,
            }
        ]

        summary = aggregate_results(rows, MODELS)

        for model_key in ("gemma4", "gemini3_flash"):
            assert model_key in summary
        assert summary["gemma4"]["success_rate"] == 1.0
        assert summary["gemma4"]["latency"]["mean_ms"] == 100
        assert summary["gemma4"]["accuracy"]["ocr_mean"] == 0.9

    def test_all_failures(self):
        """When every call failed, the success rate collapses to zero."""
        failure = {"success": False, "error": "API error"}
        rows = [
            {
                "image_id": "img1",
                "category": "photo",
                "gemma4": dict(failure),
                "gemini3_flash": dict(failure),
            }
        ]
        summary = aggregate_results(rows, MODELS)
        assert summary["gemma4"]["success_rate"] == 0
|
|
|
|
|
|
class TestMarkdown:
    """Unit tests for to_markdown report rendering."""

    def test_generates_report(self):
        """A populated report renders every major section plus display names."""
        gemma_summary = {
            "success_rate": 0.9,
            "latency": {"mean_ms": 100, "median_ms": 95, "p95_ms": 150, "std_ms": 20},
            "tokens": {"mean_total": 500, "total_used": 5000},
            "accuracy": {"ocr_mean": 0.85, "ocr_count": 5, "keyword_mean": 0.8, "keyword_count": 5},
        }
        flash_summary = {
            "success_rate": 0.95,
            "latency": {"mean_ms": 120, "median_ms": 110, "p95_ms": 180, "std_ms": 25},
            "tokens": {"mean_total": 600, "total_used": 6000},
            "accuracy": {"ocr_mean": 0.82, "ocr_count": 5, "keyword_mean": 0.78, "keyword_count": 5},
        }
        report = {
            "generated_at": "2026-04-16T00:00:00",
            "config": {
                "total_images": 10,
                "runs_per_model": 1,
                "models": {"gemma4": "Gemma 4 27B", "gemini3_flash": "Gemini 3 Flash"},
            },
            "summary": {"gemma4": gemma_summary, "gemini3_flash": flash_summary},
            "results": [],
        }

        rendered = to_markdown(report)

        expected_fragments = (
            "Vision Benchmark Report",
            "Latency Comparison",
            "Accuracy Comparison",
            "Token Usage",
            "Verdict",
            "Gemma 4 27B",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_empty_report(self):
        """An empty report still renders the title without crashing."""
        empty = {
            "generated_at": "2026-04-16T00:00:00",
            "config": {"total_images": 0, "runs_per_model": 1, "models": {}},
            "summary": {},
            "results": [],
        }
        assert "Vision Benchmark Report" in to_markdown(empty)
|
|
|
|
|
|
class TestDataset:
    """Unit tests for generate_sample_dataset."""

    def test_sample_dataset_has_entries(self):
        """The bundled sample dataset ships with at least four images."""
        assert len(generate_sample_dataset()) >= 4

    def test_sample_dataset_structure(self):
        """Every entry carries the full set of required fields."""
        required = ("id", "url", "category", "expected_keywords", "expected_structure")
        for entry in generate_sample_dataset():
            for field in required:
                assert field in entry

    def test_categories_present(self):
        """Screenshot, diagram and photo categories are all represented."""
        seen = {entry["category"] for entry in generate_sample_dataset()}
        for category in ("screenshot", "diagram", "photo"):
            assert category in seen
|
|
|
|
|
|
class TestModels:
    """Unit tests for the MODELS registry."""

    def test_all_models_defined(self):
        """Both benchmark contenders are registered."""
        for model_key in ("gemma4", "gemini3_flash"):
            assert model_key in MODELS

    def test_model_structure(self):
        """Each model entry exposes an id, a display name and a provider."""
        for config in MODELS.values():
            for field in ("model_id", "display_name", "provider"):
                assert field in config
|
|
|
|
|
|
class TestPrompts:
    """Unit tests for the EVAL_PROMPTS category table."""

    def test_prompts_for_categories(self):
        """Every benchmark category has a dedicated evaluation prompt."""
        for category in ("screenshot", "diagram", "photo", "ocr", "chart"):
            assert category in EVAL_PROMPTS
|