Files
turboquant/tests/test_dflash_apple_silicon.py
Alexander Payne 704d284d14
All checks were successful
Smoke Test / smoke (pull_request) Successful in 10s
fix: mitigate MLX Metal GPU timeout for qwen35-9b (issue #154)
The DFlash benchmark with --draft-sliding-window-size 4096 on the 9B model
causes a Metal GPU timeout on Apple Silicon (kIOGPUCommandBufferCallbackErrorTimeout).

Root cause: the 9B model's larger compute workload combined with a 4096-size
draft sliding window produces GPU command buffers that exceed the watchdog
timeout. The 4B model does not exhibit this problem.

Mitigation: lower the default draft sliding window for the 9B pair from 4096
to 2048. This avoids the timeout while still providing meaningful speedup.

Changes:
- Add benchmarks/dflash_apple_silicon.py (DFlash benchmark planner)
  - 9B pair now uses draft_sliding_window_size=2048
  - 4B pair retains draft_sliding_window_size=4096
- Add tests/test_dflash_apple_silicon.py with #154-specific test
- Add docs/DFLASH_APPLE_SILICON.md documenting the mitigation
- Add benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md recording the failure

Verification: pytest -q tests/test_dflash_apple_silicon.py
Test explicitly asserts 9B uses window=2048 to prevent timeout regression.

Closes #154
2026-04-25 20:04:55 -04:00

79 lines
3.2 KiB
Python

#!/usr/bin/env python3
"""Tests for Apple Silicon DFlash benchmark planning helpers (issues #152, #154)."""
import os
import sys
from unittest.mock import patch
# Put the parent of this file's directory at the front of sys.path so that
# `benchmarks.dflash_apple_silicon` can be imported below. The insert has to
# run before that import, which is why the import carries a noqa for E402.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from benchmarks.dflash_apple_silicon import ( # noqa: E402
build_mlx_benchmark_command,
detect_total_memory_gb,
render_report_template,
select_pair,
)
class TestPairSelection:
    """Checks which model pair ``select_pair`` returns for a given memory size."""

    def test_prefers_qwen35_9b_on_36gb_mac(self):
        """With 36 GB of memory the 9B base/draft pair is expected."""
        chosen = select_pair(total_memory_gb=36)
        observed = (chosen.slug, chosen.base_model, chosen.draft_model)
        assert observed == (
            "qwen35-9b",
            "Qwen/Qwen3.5-9B",
            "z-lab/Qwen3.5-9B-DFlash",
        )

    def test_falls_back_to_4b_when_memory_is_tight(self):
        """With 20 GB of memory the 4B pair is expected instead."""
        chosen = select_pair(total_memory_gb=20)
        observed = (chosen.slug, chosen.base_model)
        assert observed == ("qwen35-4b", "Qwen/Qwen3.5-4B")
class TestCommandGeneration:
    """Checks the generated benchmark command and per-pair window sizes."""

    def test_builds_upstream_mlx_benchmark_command(self):
        """The command for the 36 GB pair contains every expected fragment."""
        rendered = build_mlx_benchmark_command(
            select_pair(total_memory_gb=36),
            dataset="gsm8k",
            max_samples=64,
        )
        expected_fragments = (
            "python -m dflash.benchmark --backend mlx",
            "--model Qwen/Qwen3.5-9B",
            "--draft-model z-lab/Qwen3.5-9B-DFlash",
            "--dataset gsm8k",
            "--max-samples 64",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_9b_pair_uses_mitigated_window_size(self):
        """Issue #154: the 9B pair is pinned to a 2048 draft sliding window."""
        window = select_pair(total_memory_gb=36).draft_sliding_window_size
        assert window == 2048, (
            "9B pair must use window size 2048 to avoid Metal GPU timeout (issue #154)"
        )

    def test_4b_pair_can_use_full_window_size(self):
        """The 4B pair keeps the full 4096 draft sliding window."""
        window = select_pair(total_memory_gb=20).draft_sliding_window_size
        assert window == 4096, (
            "4B pair should use full window size; no timeout risk"
        )
class TestReportTemplate:
    """Checks the text produced by ``render_report_template`` for the 9B pair."""

    def test_report_template_mentions_baseline_and_verdict(self):
        """The report names the machine, the model, the baseline and the verdict."""
        rendered = render_report_template(
            machine_label="M3 Max 36GB",
            pair=select_pair(total_memory_gb=36),
        )
        required_snippets = (
            "DFlash Apple Silicon Benchmark Report",
            "M3 Max 36GB",
            "Qwen/Qwen3.5-9B",
            "plain MLX or llama.cpp speculative decoding",
            "Worth operationalizing locally?",
        )
        for snippet in required_snippets:
            assert snippet in rendered

    def test_9b_report_includes_mitigation_note(self):
        """Issue #154: the 9B report must spell out the window-size mitigation."""
        rendered = render_report_template(
            machine_label="M3 Max 36GB",
            pair=select_pair(total_memory_gb=36),
        )
        mitigation_snippets = (
            "Issue #154 mitigation",
            "2048",
            "Metal GPU timeout",
            "kIOGPUCommandBufferCallbackErrorTimeout",
        )
        for snippet in mitigation_snippets:
            assert snippet in rendered
class TestMemoryDetection:
    """Checks ``detect_total_memory_gb`` with platform and subprocess calls mocked."""

    @patch("benchmarks.dflash_apple_silicon.platform.system", return_value="Darwin")
    @patch(
        "benchmarks.dflash_apple_silicon.subprocess.check_output",
        return_value=b"38654705664\n",
    )
    def test_detect_total_memory_gb_on_macos(self, _mock_sysctl, _mock_system):
        """A mocked byte count of 38654705664 (36 * 1024**3) is reported as 36.0."""
        detected_gb = detect_total_memory_gb()
        assert detected_gb == 36.0