# turboquant/tests/test_m1_benchmark.py

#!/usr/bin/env python3
"""Tests for m1_mac_benchmark.py"""

import json
import os
import sys
import pytest
from unittest.mock import patch, MagicMock
from datetime import datetime, timezone

sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from benchmarks.m1_mac_benchmark import (
    Preset,
    AppleSiliconInfo,
    BenchmarkResult,
    PresetResult,
    QualityResult,
    PRESETS,
    detect_apple_silicon,
    evaluate_gsm8k,
    evaluate_tool_call,
    generate_markdown_report,
    run_preset_benchmark,
)


class TestPresets:
    """Sanity checks on the ``PRESETS`` registry."""

    def test_all_presets_defined(self):
        """Each of the three expected preset keys is registered."""
        expected_keys = (
            "turboquant_k8v4",
            "turboquant_4bit_nc",
            "turboquant_3bit_nc",
        )
        for key in expected_keys:
            assert key in PRESETS

    def test_preset_fields(self):
        """Every preset is keyed by its own name and has populated fields."""
        for key in PRESETS:
            entry = PRESETS[key]
            assert entry.name == key
            assert entry.bits_per_channel > 0
            assert entry.compression_ratio > 1
            # Truthiness checks: these fields must be set to something non-empty.
            assert entry.kv_type
            assert entry.description

    def test_presets_ordered_by_bits(self):
        """Bits per channel rank as 4bit_nc > k8v4 > 3bit_nc."""
        four_bit = PRESETS["turboquant_4bit_nc"].bits_per_channel
        k8v4 = PRESETS["turboquant_k8v4"].bits_per_channel
        assert four_bit > k8v4

        three_bit = PRESETS["turboquant_3bit_nc"].bits_per_channel
        assert k8v4 > three_bit


class TestGSM8KEval:
    """Behaviour of ``evaluate_gsm8k`` on hand-written model outputs."""

    def test_correct_answer(self):
        """The expected number appearing in the output counts as correct."""
        model_text = "Janet makes 9 + 9 = 18 dollars per day."
        verdict = evaluate_gsm8k(model_text, "18")
        assert verdict is True

    def test_correct_with_commas(self):
        """A comma-grouped number in the output matches the plain expected value."""
        model_text = "The profit is $70,000."
        verdict = evaluate_gsm8k(model_text, "70000")
        assert verdict is True

    def test_wrong_answer(self):
        """A different number in the output is scored as incorrect."""
        model_text = "The answer is 42 dollars."
        verdict = evaluate_gsm8k(model_text, "18")
        assert verdict is False

    def test_no_number(self):
        """An output containing no number at all is scored as incorrect."""
        model_text = "I'm not sure about this problem."
        verdict = evaluate_gsm8k(model_text, "18")
        assert verdict is False

    def test_correct_answer_not_last(self):
        """The expected value is found in an output with several intermediate numbers."""
        model_text = "There are 16 eggs. She eats 3, uses 4. That leaves 9. She sells for $2 each = 18 dollars."
        verdict = evaluate_gsm8k(model_text, "18")
        assert verdict is True


class TestToolCallEval:
    """Behaviour of ``evaluate_tool_call`` on hand-written model outputs."""

    def test_function_name(self):
        """Mentioning the function name in prose counts as a tool call."""
        assert evaluate_tool_call("I'll call get_weather with the parameters.") is True

    def test_json_format(self):
        """A fenced JSON payload naming the function counts as a tool call."""
        fenced_payload = '```json\n{"name": "get_weather", "arguments": {}}\n```'
        assert evaluate_tool_call(fenced_payload) is True

    def test_no_tool(self):
        """A plain-language answer with no tool reference is rejected."""
        assert evaluate_tool_call("The weather in San Francisco is sunny.") is False


class TestMarkdownReport:
    """Checks on the text returned by ``generate_markdown_report``."""

    def test_generates_report(self):
        """A single populated preset result shows up in the rendered report."""
        hardware = AppleSiliconInfo(
            chip_name="Apple M1 Max",
            total_memory_gb=32,
            performance_cores=8,
            efficiency_cores=2,
            gpu_cores=24,
            os_version="14.2",
        )
        summarization_run = BenchmarkResult(
            preset="turboquant_k8v4",
            prompt_id="summarization",
            tokens_per_sec=45.2,
            time_to_first_token_ms=150,
            total_tokens=128,
            elapsed_seconds=2.83,
        )
        k8v4_result = PresetResult(
            preset="turboquant_k8v4",
            kv_type="turbo4",
            bits_per_channel=3.5,
            compression_ratio=4.2,
            description="Best quality",
            avg_tokens_per_sec=45.2,
            peak_memory_mb=8192,
            gsm8k_score="2/3 (67%)",
            tool_call_accuracy="Yes",
            benchmarks=[summarization_run],
        )

        rendered = generate_markdown_report(hardware, [k8v4_result], "gemma-4", 4096)

        # Title, hardware name, preset name, throughput figure, and the
        # recommendation heading must all appear somewhere in the output.
        expected_fragments = (
            "TurboQuant M1 Mac Benchmark",
            "Apple M1 Max",
            "turboquant_k8v4",
            "45.2",
            "Recommendation",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_empty_results(self):
        """With no preset results the report still carries its title."""
        default_hardware = AppleSiliconInfo()
        rendered = generate_markdown_report(default_hardware, [], "test", 4096)
        assert "TurboQuant M1 Mac Benchmark" in rendered


class TestDryRun:
    """``run_preset_benchmark`` invoked with ``dry_run=True``."""

    def test_dry_run_returns_results(self):
        """A dry run of one preset reports that preset with positive metrics."""
        preset_name = "turboquant_k8v4"
        outcome = run_preset_benchmark(preset_name, dry_run=True)
        assert outcome.preset == "turboquant_k8v4"
        assert outcome.avg_tokens_per_sec > 0
        assert outcome.peak_memory_mb > 0

    def test_dry_run_all_presets(self):
        """Every registered preset can be dry-run and reports positive throughput."""
        for preset_name in PRESETS:
            outcome = run_preset_benchmark(preset_name, dry_run=True)
            assert outcome.preset == preset_name
            assert outcome.avg_tokens_per_sec > 0


class TestHardwareDetection:
    """Behaviour of ``detect_apple_silicon``."""

    def test_non_apple(self):
        """When ``platform.system`` reports Linux, the chip name is left empty."""
        system_target = "benchmarks.m1_mac_benchmark.platform.system"
        # Patch stays active for both the call and the assertion.
        with patch(system_target, return_value="Linux"):
            detected = detect_apple_silicon()
            assert detected.chip_name == ""

    def test_returns_info_structure(self):
        """Unpatched detection returns an ``AppleSiliconInfo`` with a float memory size."""
        detected = detect_apple_silicon()
        assert isinstance(detected, AppleSiliconInfo)
        assert isinstance(detected.total_memory_gb, float)