All checks were successful
Smoke Test / smoke (pull_request) Successful in 10s
- Add benchmarks/m1_mac_benchmark.py — orchestrates benchmark of all three presets (k8v4, 4bit_nc, 3bit_nc) on Apple Silicon via llama-server or vllm; measures tokens/sec (throughput), peak memory (RSS), quality via GSM8K subset (evaluator), and tool-call accuracy. - Add benchmarks/m1-mac-template.md — scaffold results markdown to be filled by the script; includes hardware detection, table, and recommendation. - Add tests/test_m1_benchmark.py — unit tests for preset definitions, quality evaluators, and markdown generation. Acceptance #94: [x] Results table with preset × tokens/sec × peak_memory × GSM8K_score × tool_call_accuracy [x] Output saved to benchmarks/m1-mac-YYYY-MM-DD.md (generated by script) [x] Recommendation format (script generates a default after running); template supplied. The benchmark requires llama-server running locally (or vllm) and Gemma 4 model. It is not executed during CI; only smoke tests validate importability and logic.
153 lines
5.0 KiB
Python
153 lines
5.0 KiB
Python
#!/usr/bin/env python3
|
|
"""Tests for m1_mac_benchmark.py"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import pytest
|
|
from unittest.mock import patch, MagicMock
|
|
from datetime import datetime, timezone
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
|
from benchmarks.m1_mac_benchmark import (
|
|
Preset,
|
|
AppleSiliconInfo,
|
|
BenchmarkResult,
|
|
PresetResult,
|
|
QualityResult,
|
|
PRESETS,
|
|
detect_apple_silicon,
|
|
evaluate_gsm8k,
|
|
evaluate_tool_call,
|
|
generate_markdown_report,
|
|
run_preset_benchmark,
|
|
)
|
|
|
|
|
|
class TestPresets:
|
|
def test_all_presets_defined(self):
|
|
assert "turboquant_k8v4" in PRESETS
|
|
assert "turboquant_4bit_nc" in PRESETS
|
|
assert "turboquant_3bit_nc" in PRESETS
|
|
|
|
def test_preset_fields(self):
|
|
for name, preset in PRESETS.items():
|
|
assert preset.name == name
|
|
assert preset.bits_per_channel > 0
|
|
assert preset.compression_ratio > 1
|
|
assert preset.kv_type
|
|
assert preset.description
|
|
|
|
def test_presets_ordered_by_bits(self):
|
|
"""k8v4 should be ~3.5b, 4bit should be 4.0, 3bit should be 3.0."""
|
|
assert PRESETS["turboquant_4bit_nc"].bits_per_channel > PRESETS["turboquant_k8v4"].bits_per_channel
|
|
assert PRESETS["turboquant_k8v4"].bits_per_channel > PRESETS["turboquant_3bit_nc"].bits_per_channel
|
|
|
|
|
|
class TestGSM8KEval:
|
|
def test_correct_answer(self):
|
|
output = "Janet makes 9 + 9 = 18 dollars per day."
|
|
assert evaluate_gsm8k(output, "18") is True
|
|
|
|
def test_correct_with_commas(self):
|
|
output = "The profit is $70,000."
|
|
assert evaluate_gsm8k(output, "70000") is True
|
|
|
|
def test_wrong_answer(self):
|
|
output = "The answer is 42 dollars."
|
|
assert evaluate_gsm8k(output, "18") is False
|
|
|
|
def test_no_number(self):
|
|
output = "I'm not sure about this problem."
|
|
assert evaluate_gsm8k(output, "18") is False
|
|
|
|
def test_correct_answer_not_last(self):
|
|
"""If the answer appears in the reasoning, not just at the end."""
|
|
output = "There are 16 eggs. She eats 3, uses 4. That leaves 9. She sells for $2 each = 18 dollars."
|
|
assert evaluate_gsm8k(output, "18") is True
|
|
|
|
|
|
class TestToolCallEval:
|
|
def test_function_name(self):
|
|
output = "I'll call get_weather with the parameters."
|
|
assert evaluate_tool_call(output) is True
|
|
|
|
def test_json_format(self):
|
|
output = '```json\n{"name": "get_weather", "arguments": {}}\n```'
|
|
assert evaluate_tool_call(output) is True
|
|
|
|
def test_no_tool(self):
|
|
output = "The weather in San Francisco is sunny."
|
|
assert evaluate_tool_call(output) is False
|
|
|
|
|
|
class TestMarkdownReport:
|
|
def test_generates_report(self):
|
|
hw = AppleSiliconInfo(
|
|
chip_name="Apple M1 Max",
|
|
total_memory_gb=32,
|
|
performance_cores=8,
|
|
efficiency_cores=2,
|
|
gpu_cores=24,
|
|
os_version="14.2",
|
|
)
|
|
results = [
|
|
PresetResult(
|
|
preset="turboquant_k8v4",
|
|
kv_type="turbo4",
|
|
bits_per_channel=3.5,
|
|
compression_ratio=4.2,
|
|
description="Best quality",
|
|
avg_tokens_per_sec=45.2,
|
|
peak_memory_mb=8192,
|
|
gsm8k_score="2/3 (67%)",
|
|
tool_call_accuracy="Yes",
|
|
benchmarks=[BenchmarkResult(
|
|
preset="turboquant_k8v4",
|
|
prompt_id="summarization",
|
|
tokens_per_sec=45.2,
|
|
time_to_first_token_ms=150,
|
|
total_tokens=128,
|
|
elapsed_seconds=2.83,
|
|
)],
|
|
),
|
|
]
|
|
report = generate_markdown_report(hw, results, "gemma-4", 4096)
|
|
|
|
assert "TurboQuant M1 Mac Benchmark" in report
|
|
assert "Apple M1 Max" in report
|
|
assert "turboquant_k8v4" in report
|
|
assert "45.2" in report
|
|
assert "Recommendation" in report
|
|
|
|
def test_empty_results(self):
|
|
hw = AppleSiliconInfo()
|
|
report = generate_markdown_report(hw, [], "test", 4096)
|
|
assert "TurboQuant M1 Mac Benchmark" in report
|
|
|
|
|
|
class TestDryRun:
|
|
def test_dry_run_returns_results(self):
|
|
result = run_preset_benchmark("turboquant_k8v4", dry_run=True)
|
|
assert result.preset == "turboquant_k8v4"
|
|
assert result.avg_tokens_per_sec > 0
|
|
assert result.peak_memory_mb > 0
|
|
|
|
def test_dry_run_all_presets(self):
|
|
for name in PRESETS:
|
|
result = run_preset_benchmark(name, dry_run=True)
|
|
assert result.preset == name
|
|
assert result.avg_tokens_per_sec > 0
|
|
|
|
|
|
class TestHardwareDetection:
|
|
@patch("benchmarks.m1_mac_benchmark.platform.system", return_value="Linux")
|
|
def test_non_apple(self, mock_system):
|
|
hw = detect_apple_silicon()
|
|
assert hw.chip_name == ""
|
|
|
|
def test_returns_info_structure(self):
|
|
hw = detect_apple_silicon()
|
|
assert isinstance(hw, AppleSiliconInfo)
|
|
assert isinstance(hw.total_memory_gb, float)
|