hermes-agent/tests/test_vision_benchmark.py
Timmy eed87e454e
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Successful in 26s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 26s
Tests / e2e (pull_request) Successful in 2m38s
Tests / test (pull_request) Failing after 47m49s
test: Benchmark Gemma 4 vision accuracy vs current approach (#817)
Vision benchmark suite comparing Gemma 4 (google/gemma-4-27b-it) vs
current Gemini 3 Flash Preview (google/gemini-3-flash-preview).

Metrics:
- OCR accuracy (character + word overlap)
- Description completeness (keyword coverage)
- Structural quality (length, sentences, numbers)
- Latency (ms per image)
- Token usage
- Consistency across runs

Features:
- 24 diverse test images (screenshots, diagrams, photos, charts)
- Category-specific evaluation prompts
- Automated verdict with composite scoring
- JSON + markdown report output
- 28 unit tests passing

Usage:
  python benchmarks/vision_benchmark.py --images benchmarks/test_images.json
  python benchmarks/vision_benchmark.py --url https://example.com/img.png
  python benchmarks/vision_benchmark.py --generate-dataset

Closes #817.
2026-04-15 23:02:02 -04:00


"""Tests for vision benchmark suite (Issue #817)."""
import json
import statistics
import sys
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
from vision_benchmark import (
compute_ocr_accuracy,
compute_description_completeness,
compute_structural_accuracy,
aggregate_results,
to_markdown,
generate_sample_dataset,
MODELS,
EVAL_PROMPTS,
)
class TestOcrAccuracy:
def test_perfect_match(self):
assert compute_ocr_accuracy("Hello World", "Hello World") == 1.0
def test_empty_ground_truth(self):
assert compute_ocr_accuracy("", "") == 1.0
assert compute_ocr_accuracy("text", "") == 0.0
def test_empty_extraction(self):
assert compute_ocr_accuracy("", "Hello") == 0.0
def test_partial_match(self):
score = compute_ocr_accuracy("Hello Wrld", "Hello World")
assert 0.5 < score < 1.0
def test_case_insensitive(self):
assert compute_ocr_accuracy("hello world", "Hello World") == 1.0
def test_whitespace_differences(self):
score = compute_ocr_accuracy(" Hello World ", "Hello World")
assert score >= 0.8
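

# --- Illustrative sketch (not part of this suite) ---------------------------
# The assertions above pin down the contract of compute_ocr_accuracy, which the
# commit message describes as "character + word overlap". A minimal hypothetical
# implementation that satisfies them is sketched below; the real function lives
# in benchmarks/vision_benchmark.py and may differ.
def _sketch_ocr_accuracy(extracted: str, ground_truth: str) -> float:
    from difflib import SequenceMatcher

    # Normalize case and whitespace before comparing.
    ext = " ".join(extracted.lower().split())
    gt = " ".join(ground_truth.lower().split())
    if not gt:
        return 1.0 if not ext else 0.0  # empty ground truth: only empty output matches
    if not ext:
        return 0.0
    # Blend a character-level similarity ratio with word-set overlap.
    char_ratio = SequenceMatcher(None, ext, gt).ratio()
    gt_words = set(gt.split())
    word_overlap = len(set(ext.split()) & gt_words) / len(gt_words)
    return 0.5 * char_ratio + 0.5 * word_overlap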


class TestDescriptionCompleteness:
    def test_all_keywords_found(self):
        keywords = ["github", "logo", "octocat"]
        text = "This is the GitHub logo featuring the octocat mascot."
        assert compute_description_completeness(text, keywords) == 1.0

    def test_partial_keywords(self):
        keywords = ["github", "logo", "octocat"]
        text = "This is the GitHub logo."
        score = compute_description_completeness(text, keywords)
        assert 0.3 < score < 0.7

    def test_no_keywords(self):
        keywords = ["github", "logo"]
        text = "Something completely different."
        assert compute_description_completeness(text, keywords) == 0.0

    def test_empty_keywords(self):
        assert compute_description_completeness("any text", []) == 1.0

    def test_empty_text(self):
        assert compute_description_completeness("", ["keyword"]) == 0.0

    def test_case_insensitive(self):
        keywords = ["GitHub", "Logo"]
        text = "The github logo is iconic."
        assert compute_description_completeness(text, keywords) == 1.0
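

# --- Illustrative sketch (not part of this suite) ---------------------------
# The tests above treat compute_description_completeness as a case-insensitive
# keyword-coverage ratio. A hypothetical implementation consistent with them;
# the actual function in benchmarks/vision_benchmark.py may differ.
def _sketch_completeness(text: str, keywords: list) -> float:
    if not keywords:
        return 1.0  # nothing expected, trivially complete
    lowered = text.lower()
    hits = sum(1 for kw in keywords if kw.lower() in lowered)
    return hits / len(keywords)  # fraction of expected keywords present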


class TestStructuralAccuracy:
    def test_length_score(self):
        text = "A" * 100
        scores = compute_structural_accuracy(text, {"min_length": 50})
        assert scores["length"] == 1.0

    def test_short_text(self):
        text = "Short."
        scores = compute_structural_accuracy(text, {"min_length": 100})
        assert scores["length"] < 1.0

    def test_sentence_count(self):
        text = "First sentence. Second sentence. Third sentence."
        scores = compute_structural_accuracy(text, {"min_sentences": 2})
        assert scores["sentences"] >= 1.0

    def test_no_sentences(self):
        text = "No sentence end"
        scores = compute_structural_accuracy(text, {"min_sentences": 1})
        assert scores["sentences"] == 0.0

    def test_has_numbers_true(self):
        text = "There are 42 items."
        scores = compute_structural_accuracy(text, {"has_numbers": True})
        assert scores["has_numbers"] == 1.0

    def test_has_numbers_false(self):
        text = "No numbers here."
        scores = compute_structural_accuracy(text, {"has_numbers": True})
        assert scores["has_numbers"] == 0.0


class TestAggregateResults:
    def test_basic_aggregation(self):
        results = [
            {
                "image_id": "img1",
                "category": "photo",
                "gemma4": {
                    "success": True,
                    "avg_latency_ms": 100,
                    "avg_tokens": 500,
                    "ocr_accuracy": 0.9,
                    "keyword_completeness": 0.8,
                    "analysis_length": 200,
                },
                "gemini3_flash": {
                    "success": True,
                    "avg_latency_ms": 150,
                    "avg_tokens": 600,
                    "ocr_accuracy": 0.85,
                    "keyword_completeness": 0.75,
                    "analysis_length": 180,
                },
            }
        ]
        models = MODELS
        summary = aggregate_results(results, models)
        assert "gemma4" in summary
        assert "gemini3_flash" in summary
        assert summary["gemma4"]["success_rate"] == 1.0
        assert summary["gemma4"]["latency"]["mean_ms"] == 100
        assert summary["gemma4"]["accuracy"]["ocr_mean"] == 0.9

    def test_all_failures(self):
        results = [
            {
                "image_id": "img1",
                "category": "photo",
                "gemma4": {"success": False, "error": "API error"},
                "gemini3_flash": {"success": False, "error": "API error"},
            }
        ]
        summary = aggregate_results(results, MODELS)
        assert summary["gemma4"]["success_rate"] == 0


class TestMarkdown:
    def test_generates_report(self):
        report = {
            "generated_at": "2026-04-16T00:00:00",
            "config": {
                "total_images": 10,
                "runs_per_model": 1,
                "models": {"gemma4": "Gemma 4 27B", "gemini3_flash": "Gemini 3 Flash"},
            },
            "summary": {
                "gemma4": {
                    "success_rate": 0.9,
                    "latency": {"mean_ms": 100, "median_ms": 95, "p95_ms": 150, "std_ms": 20},
                    "tokens": {"mean_total": 500, "total_used": 5000},
                    "accuracy": {"ocr_mean": 0.85, "ocr_count": 5, "keyword_mean": 0.8, "keyword_count": 5},
                },
                "gemini3_flash": {
                    "success_rate": 0.95,
                    "latency": {"mean_ms": 120, "median_ms": 110, "p95_ms": 180, "std_ms": 25},
                    "tokens": {"mean_total": 600, "total_used": 6000},
                    "accuracy": {"ocr_mean": 0.82, "ocr_count": 5, "keyword_mean": 0.78, "keyword_count": 5},
                },
            },
            "results": [],
        }
        md = to_markdown(report)
        assert "Vision Benchmark Report" in md
        assert "Latency Comparison" in md
        assert "Accuracy Comparison" in md
        assert "Token Usage" in md
        assert "Verdict" in md
        assert "Gemma 4 27B" in md

    def test_empty_report(self):
        report = {
            "generated_at": "2026-04-16T00:00:00",
            "config": {"total_images": 0, "runs_per_model": 1, "models": {}},
            "summary": {},
            "results": [],
        }
        md = to_markdown(report)
        assert "Vision Benchmark Report" in md


class TestDataset:
    def test_sample_dataset_has_entries(self):
        dataset = generate_sample_dataset()
        assert len(dataset) >= 4

    def test_sample_dataset_structure(self):
        dataset = generate_sample_dataset()
        for img in dataset:
            assert "id" in img
            assert "url" in img
            assert "category" in img
            assert "expected_keywords" in img
            assert "expected_structure" in img

    def test_categories_present(self):
        dataset = generate_sample_dataset()
        categories = {img["category"] for img in dataset}
        assert "screenshot" in categories
        assert "diagram" in categories
        assert "photo" in categories


class TestModels:
    def test_all_models_defined(self):
        assert "gemma4" in MODELS
        assert "gemini3_flash" in MODELS

    def test_model_structure(self):
        for name, config in MODELS.items():
            assert "model_id" in config
            assert "display_name" in config
            assert "provider" in config


class TestPrompts:
    def test_prompts_for_categories(self):
        assert "screenshot" in EVAL_PROMPTS
        assert "diagram" in EVAL_PROMPTS
        assert "photo" in EVAL_PROMPTS
        assert "ocr" in EVAL_PROMPTS
        assert "chart" in EVAL_PROMPTS