#!/usr/bin/env python3
"""Tests for Apple Silicon DFlash benchmark planning helpers (issue #152, #154)."""

import os
import sys
from unittest.mock import patch

# Put the parent of this file's directory on sys.path before the import
# below, which is why that import carries the E402 suppression.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from benchmarks.dflash_apple_silicon import (  # noqa: E402
    build_mlx_benchmark_command,
    detect_total_memory_gb,
    render_report_template,
    select_pair,
)


class TestPairSelection:
    """Checks which model pair ``select_pair`` returns for a memory size."""

    def test_prefers_qwen35_9b_on_36gb_mac(self):
        """With 36 GB the selected pair is the Qwen3.5-9B base/draft pair."""
        chosen = select_pair(total_memory_gb=36)

        assert chosen.slug == "qwen35-9b"
        assert chosen.base_model == "Qwen/Qwen3.5-9B"
        assert chosen.draft_model == "z-lab/Qwen3.5-9B-DFlash"

    def test_falls_back_to_4b_when_memory_is_tight(self):
        """With 20 GB the selected pair is the Qwen3.5-4B one."""
        chosen = select_pair(total_memory_gb=20)

        assert chosen.slug == "qwen35-4b"
        assert chosen.base_model == "Qwen/Qwen3.5-4B"


class TestCommandGeneration:
    """Checks the generated benchmark command and per-pair window sizes."""

    def test_builds_upstream_mlx_benchmark_command(self):
        """The command string contains the module invocation and every flag."""
        cmd = build_mlx_benchmark_command(
            select_pair(total_memory_gb=36),
            dataset="gsm8k",
            max_samples=64,
        )

        # Checked in this order so the first missing fragment is the one
        # reported, exactly as with a sequence of individual asserts.
        required_fragments = (
            "python -m dflash.benchmark --backend mlx",
            "--model Qwen/Qwen3.5-9B",
            "--draft-model z-lab/Qwen3.5-9B-DFlash",
            "--dataset gsm8k",
            "--max-samples 64",
        )
        for fragment in required_fragments:
            assert fragment in cmd

    def test_9b_pair_uses_mitigated_window_size(self):
        """Issue #154: the 9B pair's draft sliding window must be 2048."""
        nine_b = select_pair(total_memory_gb=36)

        assert nine_b.draft_sliding_window_size == 2048, (
            "9B pair must use window size 2048 to avoid Metal GPU timeout (issue #154)"
        )

    def test_4b_pair_can_use_full_window_size(self):
        """The 4B pair keeps the full 4096 draft sliding window."""
        four_b = select_pair(total_memory_gb=20)

        assert four_b.draft_sliding_window_size == 4096, (
            "4B pair should use full window size; no timeout risk"
        )


class TestReportTemplate:
    """Checks the text produced by ``render_report_template``."""

    def test_report_template_mentions_baseline_and_verdict(self):
        """The report names the machine, the model, a baseline and a verdict."""
        rendered = render_report_template(
            machine_label="M3 Max 36GB",
            pair=select_pair(total_memory_gb=36),
        )

        required_phrases = (
            "DFlash Apple Silicon Benchmark Report",
            "M3 Max 36GB",
            "Qwen/Qwen3.5-9B",
            "plain MLX or llama.cpp speculative decoding",
            "Worth operationalizing locally?",
        )
        for phrase in required_phrases:
            assert phrase in rendered

    def test_9b_report_includes_mitigation_note(self):
        """Issue #154: the 9B report must document the window size mitigation."""
        rendered = render_report_template(
            machine_label="M3 Max 36GB",
            pair=select_pair(total_memory_gb=36),
        )

        mitigation_markers = (
            "Issue #154 mitigation",
            "2048",
            "Metal GPU timeout",
            "kIOGPUCommandBufferCallbackErrorTimeout",
        )
        for marker in mitigation_markers:
            assert marker in rendered


class TestMemoryDetection:
    """Checks ``detect_total_memory_gb`` against stubbed system calls."""

    # Stacked patch decorators inject mocks bottom-up: the check_output mock
    # is the first extra argument and the platform.system mock the second.
    @patch(
        "benchmarks.dflash_apple_silicon.platform.system",
        return_value="Darwin",
    )
    @patch(
        "benchmarks.dflash_apple_silicon.subprocess.check_output",
        return_value=b"38654705664\n",
    )
    def test_detect_total_memory_gb_on_macos(self, _mock_sysctl, _mock_system):
        """A stubbed byte count of 38654705664 (36 * 1024**3) yields 36.0."""
        detected_gb = detect_total_memory_gb()

        assert detected_gb == 36.0