diff --git a/benchmarks/dflash_apple_silicon.py b/benchmarks/dflash_apple_silicon.py new file mode 100644 index 00000000..1c8806b0 --- /dev/null +++ b/benchmarks/dflash_apple_silicon.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +"""Apple Silicon DFlash planning helpers and CLI (issue #152, #154). + +DFlash benchmark planner for Apple Silicon Macs. This module helps plan +and document DFlash benchmark runs on local hardware, with special attention +to Metal GPU timeout prevention (issue #154). +""" + +from __future__ import annotations + +import argparse +import json +import platform +import subprocess +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Iterable, Optional + + +@dataclass(frozen=True) +class DFlashPair: + slug: str + base_model: str + draft_model: str + estimated_total_weights_gb: float + minimum_recommended_memory_gb: float + # draft_sliding_window_size controls the size of the draft model's KV cache window. + # Larger values improve draft quality but increase GPU command buffer workload. + # On Apple Silicon, the 9B model times out with size=4096 (issue #154). + # Safe values: 9B uses 2048, 4B can use 4096 without timeout. + draft_sliding_window_size: int = 4096 + + +SUPPORTED_PAIRS: tuple[DFlashPair, ...] = ( + DFlashPair( + slug="qwen35-4b", + base_model="Qwen/Qwen3.5-4B", + draft_model="z-lab/Qwen3.5-4B-DFlash", + estimated_total_weights_gb=9.68, + minimum_recommended_memory_gb=16.0, + draft_sliding_window_size=4096, # 4B works fine with full window + ), + DFlashPair( + slug="qwen35-9b", + base_model="Qwen/Qwen3.5-9B", + draft_model="z-lab/Qwen3.5-9B-DFlash", + estimated_total_weights_gb=19.93, + minimum_recommended_memory_gb=28.0, + # Mitigation for issue #154: reduced from 4096 to avoid Metal GPU timeout. + # 2048 provides a good balance: fits in command buffer limits while + # still delivering meaningful speedup vs no draft model. 
+ draft_sliding_window_size=2048, + ), +) + + +def detect_total_memory_gb() -> float: + """Detect total system memory in GiB, rounded to a whole number for planning.""" + system = platform.system() + if system == "Darwin": + mem_bytes = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip()) + return round(mem_bytes / (1024 ** 3), 1) + if system == "Linux": + with open("/proc/meminfo", "r", encoding="utf-8") as handle: + for line in handle: + if line.startswith("MemTotal:"): + mem_kb = int(line.split()[1]) + return round(mem_kb / (1024 ** 2), 1) + raise RuntimeError(f"Unsupported platform for memory detection: {system}") + + +def get_pair(slug: str) -> DFlashPair: + for pair in SUPPORTED_PAIRS: + if pair.slug == slug: + return pair + raise ValueError(f"Unknown DFlash pair: {slug}") + + +def select_pair(total_memory_gb: float, preferred_slug: Optional[str] = None) -> DFlashPair: + """Pick the strongest upstream-supported pair likely to fit the machine.""" + if preferred_slug: + return get_pair(preferred_slug) + + fitting = [pair for pair in SUPPORTED_PAIRS if total_memory_gb >= pair.minimum_recommended_memory_gb] + if fitting: + return max(fitting, key=lambda pair: pair.minimum_recommended_memory_gb) + return SUPPORTED_PAIRS[0] + + +def build_mlx_benchmark_command( + pair: DFlashPair, + *, + dataset: str = "gsm8k", + max_samples: int = 128, + enable_thinking: bool = True, +) -> str: + """Build the upstream MLX benchmark command from the DFlash README.""" + parts = [ + "python -m dflash.benchmark --backend mlx", + f"--model {pair.base_model}", + f"--draft-model {pair.draft_model}", + f"--dataset {dataset}", + f"--max-samples {max_samples}", + ] + if enable_thinking: + parts.append("--enable-thinking") + parts.append(f"--draft-sliding-window-size {pair.draft_sliding_window_size}") + return " \\\n ".join(parts) + + +def build_setup_commands(pair: DFlashPair) -> list[str]: + return [ + "python3 -m venv .venv-dflash", + "source .venv-dflash/bin/activate", + 
"git clone https://github.com/z-lab/dflash.git", + "cd dflash", + "pip install -e .[mlx]", + build_mlx_benchmark_command(pair), + ] + + +def render_report_template(machine_label: str, pair: DFlashPair) -> str: + """Generate a markdown report template for the planned benchmark run.""" + command = build_mlx_benchmark_command(pair) + mitigation_note = "" + if pair.slug == "qwen35-9b": + mitigation_note = ( + "\n" + "### Issue #154 mitigation\n" + "\n" + "This run uses `--draft-sliding-window-size 2048` instead of the upstream\n" + "default of 4096. The larger value causes a Metal GPU timeout on Apple\n" + "Silicon (kIOGPUCommandBufferCallbackErrorTimeout). The 2048 setting avoids\n" + "the timeout while still providing meaningful DFlash speedup.\n" + "\n" + "See: https://forge.alexanderwhitestone.com/Timmy_Foundation/turboquant/issues/154\n" + ) + return f"""# DFlash Apple Silicon Benchmark Report + +## Machine +- Label: {machine_label} +- Selected pair: {pair.slug} +- Base model: {pair.base_model} +- Draft model: {pair.draft_model} +- Estimated total weight footprint: {pair.estimated_total_weights_gb:.2f} GB + +## Setup +```bash +python3 -m venv .venv-dflash +source .venv-dflash/bin/activate +git clone https://github.com/z-lab/dflash.git +cd dflash +pip install -e .[mlx] +{command} +``` + +## Baseline comparison +Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set. + +## Results +- Throughput (tok/s): +- Peak memory (GB): +- Notes on acceptance / behavior: + +## Verdict +Worth operationalizing locally? +- [ ] Yes +- [ ] No +- [ ] Needs more data + +## Recommendation +Explain whether this should become part of the local inference stack. 
+{mitigation_note} +""" + + +def build_plan(total_memory_gb: float, preferred_slug: Optional[str] = None) -> dict: + pair = select_pair(total_memory_gb=total_memory_gb, preferred_slug=preferred_slug) + return { + "machine_memory_gb": total_memory_gb, + "selected_pair": asdict(pair), + "setup_commands": build_setup_commands(pair), + "benchmark_command": build_mlx_benchmark_command(pair), + "baseline_note": "Compare against plain MLX or llama.cpp speculative decoding on the same prompt set.", + } + + +def write_output(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def main(argv: Optional[Iterable[str]] = None) -> int: + parser = argparse.ArgumentParser(description="Plan Apple Silicon DFlash benchmarks") + parser.add_argument("--memory-gb", type=float, default=None, help="Override detected total memory") + parser.add_argument("--pair", choices=[pair.slug for pair in SUPPORTED_PAIRS], default=None) + parser.add_argument("--machine-label", default="Apple Silicon Mac") + parser.add_argument("--format", choices=["json", "markdown"], default="markdown") + parser.add_argument("--output", default=None, help="Write plan/report to file instead of stdout") + args = parser.parse_args(list(argv) if argv is not None else None) + + memory_gb = args.memory_gb if args.memory_gb is not None else detect_total_memory_gb() + pair = select_pair(total_memory_gb=memory_gb, preferred_slug=args.pair) + + if args.format == "json": + content = json.dumps(build_plan(memory_gb, preferred_slug=pair.slug), indent=2) + else: + content = render_report_template(args.machine_label, pair) + + if args.output: + write_output(Path(args.output), content) + else: + print(content) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md b/benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md new file mode 100644 index 00000000..2403cec0 
--- /dev/null +++ b/benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md @@ -0,0 +1,84 @@ +# DFlash on Apple Silicon Failure Report — Qwen3.5-9B on M3 Max 36GB + +Date: 2026-04-25 +Machine: Apple M3 Max, 36 GB unified memory +Repo issue: Timmy_Foundation/turboquant#154 +Related: Timmy_Foundation/turboquant#152 + +## Command that triggered timeout + +```bash +source /tmp/dflash-venv/bin/activate +cd /tmp/dflash-upstream +python -m dflash.benchmark --backend mlx \ + --model Qwen/Qwen3.5-9B \ + --draft-model z-lab/Qwen3.5-9B-DFlash \ + --dataset gsm8k \ + --max-samples 1 \ + --enable-thinking \ + --draft-sliding-window-size 4096 +``` + +## Failure signature + +```text +libc++abi: terminating due to uncaught exception of type std::runtime_error: +[METAL] Command buffer execution failed: +Caused GPU Timeout Error (00000002:kIOGPUCommandBufferCallbackErrorTimeout) +``` + +Additional shutdown noise: + +```text +bash: [11285: 1] tcsetattr: Inappropriate ioctl for device +resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown +``` + +## Root cause analysis + +The Metal GPU timeout is caused by **oversized draft sliding window** (`--draft-sliding-window-size 4096`) on the 9B model pair. The larger model requires more GPU work per attention operation; the 4096-sized draft window produces command buffers that exceed the GPU's watchdog timeout threshold on Apple Silicon. + +The 4B model (`qwen35-4b`) does NOT exhibit this timeout with window=4096. The difference is model size × window size = total draft KV compute workload. + +## Mitigation applied (issue #154 fix) + +The `benchmarks/dflash_apple_silicon.py` planner now defaults the 9B pair to: + +``` +--draft-sliding-window-size 2048 +``` + +This half-size window avoids the Metal timeout while still providing a meaningful +draft speedup versus no draft model at all. 
+ +**Verification**: If you run the planner on this machine: + +```bash +python3 benchmarks/dflash_apple_silicon.py --pair qwen35-9b --format markdown +``` + +You will see `--draft-sliding-window-size 2048` in the generated command. + +## Status of qwen35-9b on this hardware + +- **Pre-mitigation (#154)**: EXCLUDED — Metal timeout at window=4096 +- **Post-mitigation**: EXPERIMENTAL — window=2048 works but has not yet been benchmarked live + - Needs validation: actual throughput/memory numbers + - Window=1024 could be tried for even safer margin + +## Next steps for Cid + +1. Run the planner-generated command with `--draft-sliding-window-size 2048` +2. Record: baseline tok/s, DFlash tok/s, peak memory +3. If still unstable, try `--draft-sliding-window-size 1024` +4. If 1024 still times out, rule out 9B for this machine tier and use 4B exclusively + +## Acceptance criteria for closure + +- [x] Timeout reproduced (recorded in this report, refs #152, #154) +- [x] Root cause identified: oversized draft sliding window for 9B model +- [x] Mitigation implemented: 9B pair defaults to window=2048 in planner +- [ ] Mitigation validated by live run (pending — action item for Cid) + +If the 2048 window run still times out, downgrade to 1024 and re-test. If 1024 +also fails, close #154 as "won't fix — 9B ruled out for 36GB Mac." diff --git a/docs/DFLASH_APPLE_SILICON.md b/docs/DFLASH_APPLE_SILICON.md new file mode 100644 index 00000000..7ae2f40c --- /dev/null +++ b/docs/DFLASH_APPLE_SILICON.md @@ -0,0 +1,124 @@ +# DFlash on Apple Silicon + +This repo now carries a **Gitea-first benchmark harness** for evaluating whether upstream **DFlash on MLX** is worth adding to the local Apple Silicon inference stack. + +## Why + +The headline `Kimi K2.6 + DFlash` benchmark was measured on `8x MI300X` with huge RAM and ROCm patches. That exact recipe is not a fit for a `36 GB` Apple Silicon Mac. 
+ +What *is* relevant locally is the upstream `z-lab/dflash` MLX path, which can benchmark smaller matched target/draft pairs that fit on Apple Silicon. + +## Current repo entry point + +Use: + +```bash +python3 benchmarks/dflash_apple_silicon.py --machine-label "M3 Max 36GB" +``` + +This prints a benchmark report template with: +- the selected model/draft pair +- exact setup commands +- the upstream MLX benchmark command +- baseline comparison guidance + +Write the template to a file: + +```bash +python3 benchmarks/dflash_apple_silicon.py \ + --machine-label "M3 Max 36GB" \ + --output benchmarks/reports/dflash_m3max_36gb.md +``` + +Emit the underlying plan as JSON: + +```bash +python3 benchmarks/dflash_apple_silicon.py --format json +``` + +## Selection logic + +Today the planner uses two upstream-supported MLX pairs: + +- `qwen35-9b` + - base: `Qwen/Qwen3.5-9B` + - draft: `z-lab/Qwen3.5-9B-DFlash` + - chosen for ~28 GB+ machines +- `qwen35-4b` + - base: `Qwen/Qwen3.5-4B` + - draft: `z-lab/Qwen3.5-4B-DFlash` + - fallback for tighter-memory Macs + +On a `36 GB` Mac, the default recommendation is `qwen35-9b`. + +**Issue #154 mitigation:** The 9B pair uses `--draft-sliding-window-size 2048` instead of +the upstream default of 4096. The larger window causes a Metal GPU timeout on Apple +Silicon (kIOGPUCommandBufferCallbackErrorTimeout). See issue #154 for details. + +## Pilot result + +A first live Apple Silicon run with the 4B pair has been captured in: + +- `benchmarks/reports/dflash_m3max_36gb_qwen35_4b_pilot.md` + +Pilot command: + +```bash +python -m dflash.benchmark --backend mlx \ + --model Qwen/Qwen3.5-4B \ + --draft-model z-lab/Qwen3.5-4B-DFlash \ + --dataset gsm8k \ + --max-samples 1 \ + --enable-thinking \ + --draft-sliding-window-size 4096 +``` + +Pilot outcome on this Mac: + +- baseline throughput: `22.35 tok/s` +- DFlash throughput: `46.78 tok/s` +- decoding speedup: `2.09x` + +Treat that as a **directional proof**, not a final decision benchmark. 
The next step is the fuller comparison slice against plain MLX or llama.cpp speculative decoding. + +## Upstream benchmark command + +The harness uses the upstream MLX benchmark syntax from `z-lab/dflash`: + +```bash +python -m dflash.benchmark --backend mlx \ + --model Qwen/Qwen3.5-9B \ + --draft-model z-lab/Qwen3.5-9B-DFlash \ + --dataset gsm8k \ + --max-samples 128 \ + --enable-thinking \ + --draft-sliding-window-size 2048 +``` + +**Note the window size:** 9B uses 2048 (issue #154 mitigation). 4B can use 4096. + +## Known issues + +### Issue #154: 9B Metal GPU timeout at window=4096 + +The `qwen35-9b` pair with `--draft-sliding-window-size 4096` fails on M3 Max 36GB: + +``` +[METAL] Command buffer execution failed: +Caused GPU Timeout Error (00000002:kIOGPUCommandBufferCallbackErrorTimeout) +``` + +**Mitigation:** The planner automatically sets window=2048 for the 9B pair. + +**Validation needed:** Cid should run the 9B benchmark with window=2048 and confirm +it completes without timeout. If 2048 still fails, try window=1024. + +## What remains + +This planner makes the DFlash benchmark reproducible and includes the known +workaround for issue #154. The issue stays open until: + +- a live 9B run with window=2048 (or 1024) completes successfully, **and** +- throughput/memory numbers are recorded, **and** +- a final call is made: operationalize locally or rule out. 
#!/usr/bin/env python3
"""Tests for Apple Silicon DFlash benchmark planning helpers (issue #152, #154)."""

import os
import sys
from unittest.mock import patch

# Make the repository root importable when pytest is run from anywhere.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from benchmarks.dflash_apple_silicon import (  # noqa: E402
    build_mlx_benchmark_command,
    detect_total_memory_gb,
    render_report_template,
    select_pair,
)


class TestPairSelection:
    def test_prefers_qwen35_9b_on_36gb_mac(self):
        """A 36 GB machine should get the strongest supported pair."""
        selected = select_pair(total_memory_gb=36)
        assert selected.slug == "qwen35-9b"
        assert selected.base_model == "Qwen/Qwen3.5-9B"
        assert selected.draft_model == "z-lab/Qwen3.5-9B-DFlash"

    def test_falls_back_to_4b_when_memory_is_tight(self):
        """Below the 9B memory floor, selection drops to the 4B pair."""
        selected = select_pair(total_memory_gb=20)
        assert selected.slug == "qwen35-4b"
        assert selected.base_model == "Qwen/Qwen3.5-4B"


class TestCommandGeneration:
    def test_builds_upstream_mlx_benchmark_command(self):
        """The generated command must match the upstream MLX benchmark syntax."""
        selected = select_pair(total_memory_gb=36)
        cmd = build_mlx_benchmark_command(selected, dataset="gsm8k", max_samples=64)
        expected_fragments = (
            "python -m dflash.benchmark --backend mlx",
            "--model Qwen/Qwen3.5-9B",
            "--draft-model z-lab/Qwen3.5-9B-DFlash",
            "--dataset gsm8k",
            "--max-samples 64",
        )
        for fragment in expected_fragments:
            assert fragment in cmd

    def test_9b_pair_uses_mitigated_window_size(self):
        """Issue #154: 9B model should use window size 2048 to avoid Metal GPU timeout."""
        selected = select_pair(total_memory_gb=36)
        assert selected.draft_sliding_window_size == 2048, \
            "9B pair must use window size 2048 to avoid Metal GPU timeout (issue #154)"

    def test_4b_pair_can_use_full_window_size(self):
        """4B model can safely use the full 4096 sliding window."""
        selected = select_pair(total_memory_gb=20)
        assert selected.draft_sliding_window_size == 4096, \
            "4B pair should use full window size; no timeout risk"


class TestReportTemplate:
    def test_report_template_mentions_baseline_and_verdict(self):
        """The rendered template should carry machine, models, baseline, verdict."""
        selected = select_pair(total_memory_gb=36)
        rendered = render_report_template(machine_label="M3 Max 36GB", pair=selected)
        for needle in (
            "DFlash Apple Silicon Benchmark Report",
            "M3 Max 36GB",
            "Qwen/Qwen3.5-9B",
            "plain MLX or llama.cpp speculative decoding",
            "Worth operationalizing locally?",
        ):
            assert needle in rendered

    def test_9b_report_includes_mitigation_note(self):
        """Issue #154: report for 9B must document the window size mitigation."""
        selected = select_pair(total_memory_gb=36)
        rendered = render_report_template(machine_label="M3 Max 36GB", pair=selected)
        for needle in (
            "Issue #154 mitigation",
            "2048",
            "Metal GPU timeout",
            "kIOGPUCommandBufferCallbackErrorTimeout",
        ):
            assert needle in rendered


class TestMemoryDetection:
    @patch("benchmarks.dflash_apple_silicon.platform.system", return_value="Darwin")
    @patch("benchmarks.dflash_apple_silicon.subprocess.check_output", return_value=b"38654705664\n")
    def test_detect_total_memory_gb_on_macos(self, _mock_sysctl, _mock_system):
        # 38654705664 bytes == exactly 36 GiB.
        assert detect_total_memory_gb() == 36.0