Files
turboquant/tests/test_m1_benchmark.py
STEP35 CLI 89bf027780
All checks were successful
Smoke Test / smoke (pull_request) Successful in 10s
4.10: M1 Mac benchmark suite for TurboQuant presets (closes #94)
- Add benchmarks/m1_mac_benchmark.py — orchestrates a benchmark of all three
  presets (k8v4, 4bit_nc, 3bit_nc) on Apple Silicon via llama-server or vLLM; measures throughput (tokens/sec), peak memory (RSS), quality on a GSM8K subset (via the evaluator), and tool-call accuracy.
- Add benchmarks/m1-mac-template.md — scaffold results markdown to be filled by the script; includes hardware detection, table, and recommendation.
- Add tests/test_m1_benchmark.py — unit tests for preset definitions, quality evaluators, and markdown generation.

Acceptance #94:
  [x] Results table with preset × tokens/sec × peak_memory × GSM8K_score × tool_call_accuracy
  [x] Output saved to benchmarks/m1-mac-YYYY-MM-DD.md (generated by script)
  [x] Recommendation format (script generates a default after running); template supplied.

The benchmark requires llama-server (or vLLM) running locally and the Gemma 4 model. It is not executed during CI; CI runs only smoke tests, which validate importability and logic.
2026-04-26 07:13:23 -04:00

153 lines
5.0 KiB
Python

#!/usr/bin/env python3
"""Tests for m1_mac_benchmark.py"""
import json
import os
import sys
import pytest
from unittest.mock import patch, MagicMock
from datetime import datetime, timezone
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from benchmarks.m1_mac_benchmark import (
Preset,
AppleSiliconInfo,
BenchmarkResult,
PresetResult,
QualityResult,
PRESETS,
detect_apple_silicon,
evaluate_gsm8k,
evaluate_tool_call,
generate_markdown_report,
run_preset_benchmark,
)
class TestPresets:
    """Sanity checks on the ``PRESETS`` registry."""

    def test_all_presets_defined(self):
        """All three TurboQuant preset keys are present in ``PRESETS``."""
        expected_keys = (
            "turboquant_k8v4",
            "turboquant_4bit_nc",
            "turboquant_3bit_nc",
        )
        for key in expected_keys:
            assert key in PRESETS

    def test_preset_fields(self):
        """Each preset's name matches its key and its fields are populated."""
        for key, entry in PRESETS.items():
            assert entry.name == key
            assert entry.bits_per_channel > 0
            assert entry.compression_ratio > 1
            # Truthiness checks: these fields must be non-empty.
            assert entry.kv_type
            assert entry.description

    def test_presets_ordered_by_bits(self):
        """Bit widths are strictly ordered: 4bit_nc > k8v4 > 3bit_nc."""
        four_bit = PRESETS["turboquant_4bit_nc"].bits_per_channel
        k8v4 = PRESETS["turboquant_k8v4"].bits_per_channel
        assert four_bit > k8v4

        three_bit = PRESETS["turboquant_3bit_nc"].bits_per_channel
        assert k8v4 > three_bit
class TestGSM8KEval:
    """Checks ``evaluate_gsm8k`` against hand-written model outputs."""

    def test_correct_answer(self):
        """An output containing the expected number is accepted."""
        verdict = evaluate_gsm8k("Janet makes 9 + 9 = 18 dollars per day.", "18")
        assert verdict is True

    def test_correct_with_commas(self):
        """A comma-grouped number in the output matches the plain expected value."""
        verdict = evaluate_gsm8k("The profit is $70,000.", "70000")
        assert verdict is True

    def test_wrong_answer(self):
        """An output with a different number is rejected."""
        verdict = evaluate_gsm8k("The answer is 42 dollars.", "18")
        assert verdict is False

    def test_no_number(self):
        """An output with no number at all is rejected."""
        verdict = evaluate_gsm8k("I'm not sure about this problem.", "18")
        assert verdict is False

    def test_correct_answer_not_last(self):
        """If the answer appears in the reasoning, not just at the end."""
        reasoning = (
            "There are 16 eggs. She eats 3, uses 4. That leaves 9. "
            "She sells for $2 each = 18 dollars."
        )
        verdict = evaluate_gsm8k(reasoning, "18")
        assert verdict is True
class TestToolCallEval:
    """Checks ``evaluate_tool_call`` against hand-written model outputs."""

    def test_function_name(self):
        """Prose that mentions calling ``get_weather`` counts as a tool call."""
        detected = evaluate_tool_call("I'll call get_weather with the parameters.")
        assert detected is True

    def test_json_format(self):
        """A fenced JSON object with name/arguments counts as a tool call."""
        fenced = '```json\n{"name": "get_weather", "arguments": {}}\n```'
        detected = evaluate_tool_call(fenced)
        assert detected is True

    def test_no_tool(self):
        """A plain-language answer is not treated as a tool call."""
        detected = evaluate_tool_call("The weather in San Francisco is sunny.")
        assert detected is False
class TestMarkdownReport:
    """Checks the text produced by ``generate_markdown_report``."""

    def test_generates_report(self):
        """A report built from one preset result contains the key fragments."""
        hardware = AppleSiliconInfo(
            chip_name="Apple M1 Max",
            total_memory_gb=32,
            performance_cores=8,
            efficiency_cores=2,
            gpu_cores=24,
            os_version="14.2",
        )

        # Build the fixture inside-out: one benchmark run, then its preset.
        single_run = BenchmarkResult(
            preset="turboquant_k8v4",
            prompt_id="summarization",
            tokens_per_sec=45.2,
            time_to_first_token_ms=150,
            total_tokens=128,
            elapsed_seconds=2.83,
        )
        preset_row = PresetResult(
            preset="turboquant_k8v4",
            kv_type="turbo4",
            bits_per_channel=3.5,
            compression_ratio=4.2,
            description="Best quality",
            avg_tokens_per_sec=45.2,
            peak_memory_mb=8192,
            gsm8k_score="2/3 (67%)",
            tool_call_accuracy="Yes",
            benchmarks=[single_run],
        )

        rendered = generate_markdown_report(hardware, [preset_row], "gemma-4", 4096)

        expected_fragments = (
            "TurboQuant M1 Mac Benchmark",
            "Apple M1 Max",
            "turboquant_k8v4",
            "45.2",
            "Recommendation",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_empty_results(self):
        """With no preset results, the report still carries its title."""
        default_hardware = AppleSiliconInfo()
        rendered = generate_markdown_report(default_hardware, [], "test", 4096)
        assert "TurboQuant M1 Mac Benchmark" in rendered
class TestDryRun:
    """Checks ``run_preset_benchmark`` when called with ``dry_run=True``."""

    def test_dry_run_returns_results(self):
        """A k8v4 dry run reports its preset name and positive metrics."""
        outcome = run_preset_benchmark("turboquant_k8v4", dry_run=True)
        assert outcome.preset == "turboquant_k8v4"
        assert outcome.avg_tokens_per_sec > 0
        assert outcome.peak_memory_mb > 0

    def test_dry_run_all_presets(self):
        """Every registered preset yields a named result with positive throughput."""
        for preset_key in PRESETS:
            outcome = run_preset_benchmark(preset_key, dry_run=True)
            assert outcome.preset == preset_key
            assert outcome.avg_tokens_per_sec > 0
class TestHardwareDetection:
    """Checks the value returned by ``detect_apple_silicon``."""

    @patch("benchmarks.m1_mac_benchmark.platform.system", return_value="Linux")
    def test_non_apple(self, mock_system):
        """With ``platform.system`` patched to "Linux", chip_name is empty."""
        # mock_system is injected by the patch decorator and not used directly.
        info = detect_apple_silicon()
        assert info.chip_name == ""

    def test_returns_info_structure(self):
        """Unpatched detection returns an AppleSiliconInfo with a float memory size."""
        info = detect_apple_silicon()
        assert isinstance(info, AppleSiliconInfo)
        assert isinstance(info.total_memory_gb, float)