hermes-agent/tests/test_vision_benchmark.py
Timmy eed87e454e
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Successful in 26s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 26s
Tests / e2e (pull_request) Successful in 2m38s
Tests / test (pull_request) Failing after 47m49s
test: Benchmark Gemma 4 vision accuracy vs current approach (#817)
Vision benchmark suite comparing Gemma 4 (google/gemma-4-27b-it) vs
current Gemini 3 Flash Preview (google/gemini-3-flash-preview).

Metrics:
- OCR accuracy (character + word overlap)
- Description completeness (keyword coverage)
- Structural quality (length, sentences, numbers)
- Latency (ms per image)
- Token usage
- Consistency across runs

Features:
- 24 diverse test images (screenshots, diagrams, photos, charts)
- Category-specific evaluation prompts
- Automated verdict with composite scoring
- JSON + markdown report output
- 28 unit tests passing

Usage:
  python benchmarks/vision_benchmark.py --images benchmarks/test_images.json
  python benchmarks/vision_benchmark.py --url https://example.com/img.png
  python benchmarks/vision_benchmark.py --generate-dataset

Closes #817.
2026-04-15 23:02:02 -04:00


"""Tests for vision benchmark suite (Issue #817)."""
import json
import statistics
import sys
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
from vision_benchmark import (
compute_ocr_accuracy,
compute_description_completeness,
compute_structural_accuracy,
aggregate_results,
to_markdown,
generate_sample_dataset,
MODELS,
EVAL_PROMPTS,
)
class TestOcrAccuracy:
def test_perfect_match(self):
assert compute_ocr_accuracy("Hello World", "Hello World") == 1.0
def test_empty_ground_truth(self):
assert compute_ocr_accuracy("", "") == 1.0
assert compute_ocr_accuracy("text", "") == 0.0
def test_empty_extraction(self):
assert compute_ocr_accuracy("", "Hello") == 0.0
def test_partial_match(self):
score = compute_ocr_accuracy("Hello Wrld", "Hello World")
assert 0.5 < score < 1.0
def test_case_insensitive(self):
assert compute_ocr_accuracy("hello world", "Hello World") == 1.0
def test_whitespace_differences(self):
score = compute_ocr_accuracy(" Hello World ", "Hello World")
assert score >= 0.8
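

# --- Illustrative sketch (not part of this suite) ---------------------------
# The assertions above pin down the contract of compute_ocr_accuracy, which the
# commit message describes as "character + word overlap". A minimal hypothetical
# implementation that satisfies them is sketched below; the real function lives
# in benchmarks/vision_benchmark.py and may differ.
def _sketch_ocr_accuracy(extracted: str, ground_truth: str) -> float:
    from difflib import SequenceMatcher

    # Normalize case and whitespace before comparing.
    ext = " ".join(extracted.lower().split())
    gt = " ".join(ground_truth.lower().split())
    if not gt:
        return 1.0 if not ext else 0.0  # empty ground truth: only empty output matches
    if not ext:
        return 0.0
    # Blend a character-level similarity ratio with word-set overlap.
    char_ratio = SequenceMatcher(None, ext, gt).ratio()
    gt_words = set(gt.split())
    word_overlap = len(set(ext.split()) & gt_words) / len(gt_words)
    return 0.5 * char_ratio + 0.5 * word_overlap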


class TestDescriptionCompleteness:
    def test_all_keywords_found(self):
        keywords = ["github", "logo", "octocat"]
        text = "This is the GitHub logo featuring the octocat mascot."
        assert compute_description_completeness(text, keywords) == 1.0

    def test_partial_keywords(self):
        keywords = ["github", "logo", "octocat"]
        text = "This is the GitHub logo."
        score = compute_description_completeness(text, keywords)
        assert 0.3 < score < 0.7

    def test_no_keywords(self):
        keywords = ["github", "logo"]
        text = "Something completely different."
        assert compute_description_completeness(text, keywords) == 0.0

    def test_empty_keywords(self):
        assert compute_description_completeness("any text", []) == 1.0

    def test_empty_text(self):
        assert compute_description_completeness("", ["keyword"]) == 0.0

    def test_case_insensitive(self):
        keywords = ["GitHub", "Logo"]
        text = "The github logo is iconic."
        assert compute_description_completeness(text, keywords) == 1.0
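

# --- Illustrative sketch (not part of this suite) ---------------------------
# The tests above treat compute_description_completeness as a case-insensitive
# keyword-coverage ratio. A hypothetical implementation consistent with them;
# the actual function in benchmarks/vision_benchmark.py may differ.
def _sketch_completeness(text: str, keywords: list) -> float:
    if not keywords:
        return 1.0  # nothing expected, trivially complete
    lowered = text.lower()
    hits = sum(1 for kw in keywords if kw.lower() in lowered)
    return hits / len(keywords)  # fraction of expected keywords present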


class TestStructuralAccuracy:
    def test_length_score(self):
        text = "A" * 100
        scores = compute_structural_accuracy(text, {"min_length": 50})
        assert scores["length"] == 1.0

    def test_short_text(self):
        text = "Short."
        scores = compute_structural_accuracy(text, {"min_length": 100})
        assert scores["length"] < 1.0

    def test_sentence_count(self):
        text = "First sentence. Second sentence. Third sentence."
        scores = compute_structural_accuracy(text, {"min_sentences": 2})
        assert scores["sentences"] >= 1.0

    def test_no_sentences(self):
        text = "No sentence end"
        scores = compute_structural_accuracy(text, {"min_sentences": 1})
        assert scores["sentences"] == 0.0

    def test_has_numbers_true(self):
        text = "There are 42 items."
        scores = compute_structural_accuracy(text, {"has_numbers": True})
        assert scores["has_numbers"] == 1.0

    def test_has_numbers_false(self):
        text = "No numbers here."
        scores = compute_structural_accuracy(text, {"has_numbers": True})
        assert scores["has_numbers"] == 0.0


class TestAggregateResults:
    def test_basic_aggregation(self):
        results = [
            {
                "image_id": "img1",
                "category": "photo",
                "gemma4": {
                    "success": True,
                    "avg_latency_ms": 100,
                    "avg_tokens": 500,
                    "ocr_accuracy": 0.9,
                    "keyword_completeness": 0.8,
                    "analysis_length": 200,
                },
                "gemini3_flash": {
                    "success": True,
                    "avg_latency_ms": 150,
                    "avg_tokens": 600,
                    "ocr_accuracy": 0.85,
                    "keyword_completeness": 0.75,
                    "analysis_length": 180,
                },
            }
        ]
        models = MODELS
        summary = aggregate_results(results, models)
        assert "gemma4" in summary
        assert "gemini3_flash" in summary
        assert summary["gemma4"]["success_rate"] == 1.0
        assert summary["gemma4"]["latency"]["mean_ms"] == 100
        assert summary["gemma4"]["accuracy"]["ocr_mean"] == 0.9

    def test_all_failures(self):
        results = [
            {
                "image_id": "img1",
                "category": "photo",
                "gemma4": {"success": False, "error": "API error"},
                "gemini3_flash": {"success": False, "error": "API error"},
            }
        ]
        summary = aggregate_results(results, MODELS)
        assert summary["gemma4"]["success_rate"] == 0


class TestMarkdown:
    def test_generates_report(self):
        report = {
            "generated_at": "2026-04-16T00:00:00",
            "config": {
                "total_images": 10,
                "runs_per_model": 1,
                "models": {"gemma4": "Gemma 4 27B", "gemini3_flash": "Gemini 3 Flash"},
            },
            "summary": {
                "gemma4": {
                    "success_rate": 0.9,
                    "latency": {"mean_ms": 100, "median_ms": 95, "p95_ms": 150, "std_ms": 20},
                    "tokens": {"mean_total": 500, "total_used": 5000},
                    "accuracy": {"ocr_mean": 0.85, "ocr_count": 5, "keyword_mean": 0.8, "keyword_count": 5},
                },
                "gemini3_flash": {
                    "success_rate": 0.95,
                    "latency": {"mean_ms": 120, "median_ms": 110, "p95_ms": 180, "std_ms": 25},
                    "tokens": {"mean_total": 600, "total_used": 6000},
                    "accuracy": {"ocr_mean": 0.82, "ocr_count": 5, "keyword_mean": 0.78, "keyword_count": 5},
                },
            },
            "results": [],
        }
        md = to_markdown(report)
        assert "Vision Benchmark Report" in md
        assert "Latency Comparison" in md
        assert "Accuracy Comparison" in md
        assert "Token Usage" in md
        assert "Verdict" in md
        assert "Gemma 4 27B" in md

    def test_empty_report(self):
        report = {
            "generated_at": "2026-04-16T00:00:00",
            "config": {"total_images": 0, "runs_per_model": 1, "models": {}},
            "summary": {},
            "results": [],
        }
        md = to_markdown(report)
        assert "Vision Benchmark Report" in md


class TestDataset:
    def test_sample_dataset_has_entries(self):
        dataset = generate_sample_dataset()
        assert len(dataset) >= 4

    def test_sample_dataset_structure(self):
        dataset = generate_sample_dataset()
        for img in dataset:
            assert "id" in img
            assert "url" in img
            assert "category" in img
            assert "expected_keywords" in img
            assert "expected_structure" in img

    def test_categories_present(self):
        dataset = generate_sample_dataset()
        categories = {img["category"] for img in dataset}
        assert "screenshot" in categories
        assert "diagram" in categories
        assert "photo" in categories


class TestModels:
    def test_all_models_defined(self):
        assert "gemma4" in MODELS
        assert "gemini3_flash" in MODELS

    def test_model_structure(self):
        for name, config in MODELS.items():
            assert "model_id" in config
            assert "display_name" in config
            assert "provider" in config


class TestPrompts:
    def test_prompts_for_categories(self):
        assert "screenshot" in EVAL_PROMPTS
        assert "diagram" in EVAL_PROMPTS
        assert "photo" in EVAL_PROMPTS
        assert "ocr" in EVAL_PROMPTS
        assert "chart" in EVAL_PROMPTS