docs: record Qwen3.5-9B DFlash Metal timeout (refs #152 , #154 )

bench: record Apple Silicon DFlash pilot result (refs #152 )
feat: add Apple Silicon DFlash benchmark planner (refs #152 )
2026-04-21 22:25:25 -04:00 · 2026-04-21 22:20:15 -04:00 · 2026-04-21 22:00:22 -04:00
9 changed files with 539 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -30,3 +30,4 @@ See [issues](https://forge.alexanderwhitestone.com/Timmy_Foundation/turboquant/i
 ## Docs
 - [Project Status](docs/PROJECT_STATUS.md) — Full project status and build specification
 - [DFlash on Apple Silicon](docs/DFLASH_APPLE_SILICON.md) — MLX benchmark planner, setup commands, and report workflow
--- a/benchmarks/dflash_apple_silicon.py
+++ b/benchmarks/dflash_apple_silicon.py
@@ -0,0 +1,189 @@
 #!/usr/bin/env python3
 """Apple Silicon DFlash planning helpers and CLI (issue #152)."""
 from __future__ import annotations
 import argparse
 import json
 import platform
 import subprocess
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Iterable, Optional
@dataclass(frozen=True)
 class DFlashPair:
    """Immutable description of one target/draft model pairing the planner can select."""
    # Short identifier matched by get_pair() and offered as a --pair CLI choice.
    slug: str
    # Target model identifier, emitted as the benchmark's --model value.
    base_model: str
    # Draft model identifier, emitted as the benchmark's --draft-model value.
    draft_model: str
    # Estimated total weight footprint in GB; printed in the rendered report.
    estimated_total_weights_gb: float
    # Threshold (GB) that select_pair() compares against the machine's total memory.
    minimum_recommended_memory_gb: float
    # Value emitted for the benchmark's --draft-sliding-window-size flag.
    draft_sliding_window_size: int = 4096
 # Pairs the planner can choose between. select_pair() returns the first entry
 # when no pair's memory threshold is met, so the first entry acts as the
 # fallback choice.
 SUPPORTED_PAIRS: tuple[DFlashPair, ...] = (
    DFlashPair(
        slug="qwen35-4b",
        base_model="Qwen/Qwen3.5-4B",
        draft_model="z-lab/Qwen3.5-4B-DFlash",
        estimated_total_weights_gb=9.68,
        minimum_recommended_memory_gb=16.0,
    ),
    DFlashPair(
        slug="qwen35-9b",
        base_model="Qwen/Qwen3.5-9B",
        draft_model="z-lab/Qwen3.5-9B-DFlash",
        estimated_total_weights_gb=19.93,
        minimum_recommended_memory_gb=28.0,
    ),
 )
 def detect_total_memory_gb() -> float:
    """Detect total system memory in GiB, rounded to one decimal place.

    On macOS the byte count comes from ``sysctl -n hw.memsize``; on Linux the
    ``MemTotal`` line of ``/proc/meminfo`` (a kB figure) is used.

    Returns:
        Total memory in GiB, rounded to the nearest 0.1.

    Raises:
        RuntimeError: If the platform is neither macOS nor Linux, or if
            ``/proc/meminfo`` contains no ``MemTotal`` line.
    """
    system = platform.system()
    if system == "Darwin":
        mem_bytes = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip())
        return round(mem_bytes / (1024 ** 3), 1)
    if system == "Linux":
        with open("/proc/meminfo", "r", encoding="utf-8") as handle:
            for line in handle:
                if line.startswith("MemTotal:"):
                    mem_kb = int(line.split()[1])
                    return round(mem_kb / (1024 ** 2), 1)
        # The file was readable but had no MemTotal entry. Report that
        # directly instead of falling through to the "unsupported platform"
        # error, which would wrongly blame Linux itself.
        raise RuntimeError("Could not find MemTotal in /proc/meminfo")
    raise RuntimeError(f"Unsupported platform for memory detection: {system}")
 def get_pair(slug: str) -> DFlashPair:
    """Return the supported pair whose slug equals ``slug``.

    Raises:
        ValueError: If no entry in ``SUPPORTED_PAIRS`` carries that slug.
    """
    match = next((candidate for candidate in SUPPORTED_PAIRS if candidate.slug == slug), None)
    if match is None:
        raise ValueError(f"Unknown DFlash pair: {slug}")
    return match
 def select_pair(total_memory_gb: float, preferred_slug: Optional[str] = None) -> DFlashPair:
    """Choose the pair to benchmark on a machine with ``total_memory_gb`` of memory.

    A non-empty ``preferred_slug`` wins outright and is resolved through
    ``get_pair``. Otherwise the pair with the highest memory threshold the
    machine still meets is returned; when no threshold is met, the first
    entry of ``SUPPORTED_PAIRS`` is used as the fallback.
    """
    if not preferred_slug:
        candidates = [
            option
            for option in SUPPORTED_PAIRS
            if total_memory_gb >= option.minimum_recommended_memory_gb
        ]
        if not candidates:
            return SUPPORTED_PAIRS[0]
        return max(candidates, key=lambda option: option.minimum_recommended_memory_gb)
    return get_pair(preferred_slug)
 def build_mlx_benchmark_command(
    pair: DFlashPair,
    *,
    dataset: str = "gsm8k",
    max_samples: int = 128,
    enable_thinking: bool = True,
 ) -> str:
    """Assemble the multi-line MLX benchmark invocation for ``pair``.

    Every flag goes on its own line, joined to the previous one by a trailing
    backslash and a four-space continuation indent. ``--enable-thinking`` is
    emitted only when ``enable_thinking`` is truthy, and the sliding-window
    flag always comes last.
    """
    thinking_flag = ["--enable-thinking"] if enable_thinking else []
    segments = [
        "python -m dflash.benchmark --backend mlx",
        f"--model {pair.base_model}",
        f"--draft-model {pair.draft_model}",
        f"--dataset {dataset}",
        f"--max-samples {max_samples}",
        *thinking_flag,
        f"--draft-sliding-window-size {pair.draft_sliding_window_size}",
    ]
    continuation = " \\\n    "
    return continuation.join(segments)
 def build_setup_commands(pair: DFlashPair) -> list[str]:
    """List the shell steps, in order, that set up and run the benchmark for ``pair``.

    The steps create and activate a virtualenv, clone and install the upstream
    repository with its ``mlx`` extra, and finish with the benchmark command.
    """
    environment_steps = [
        "python3 -m venv .venv-dflash",
        "source .venv-dflash/bin/activate",
    ]
    install_steps = [
        "git clone https://github.com/z-lab/dflash.git",
        "cd dflash",
        "pip install -e .[mlx]",
    ]
    return [*environment_steps, *install_steps, build_mlx_benchmark_command(pair)]
 def render_report_template(machine_label: str, pair: DFlashPair) -> str:
    """Render the Markdown benchmark report skeleton for one machine and pair.

    The returned text embeds the machine label, the pair's slug, model names
    and estimated weight footprint (two decimals), plus a bash block that ends
    with the command from ``build_mlx_benchmark_command(pair)``. The Results,
    Verdict and Recommendation sections are left unfilled for the operator.
    """
    command = build_mlx_benchmark_command(pair)
    return f"""# DFlash Apple Silicon Benchmark Report
 ## Machine
 - Label: {machine_label}
 - Selected pair: {pair.slug}
 - Base model: {pair.base_model}
 - Draft model: {pair.draft_model}
 - Estimated total weight footprint: {pair.estimated_total_weights_gb:.2f} GB
 ## Setup
 ```bash
 python3 -m venv .venv-dflash
 source .venv-dflash/bin/activate
 git clone https://github.com/z-lab/dflash.git
 cd dflash
 pip install -e .[mlx]
 {command}
 ```
 ## Baseline comparison
 Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set.
 ## Results
 - Throughput (tok/s):
 - Peak memory (GB):
 - Notes on acceptance / behavior:
 ## Verdict
 Worth operationalizing locally?
 - [ ] Yes
 - [ ] No
 - [ ] Needs more data
 ## Recommendation
 Explain whether this should become part of the local inference stack.
 """
 def build_plan(total_memory_gb: float, preferred_slug: Optional[str] = None) -> dict:
    """Build the plan dictionary for a machine.

    The pair is chosen via ``select_pair``. The result holds the memory figure,
    the pair's fields as a plain dict, the setup steps, the benchmark command
    and a fixed baseline-comparison note, in that key order.
    """
    chosen = select_pair(total_memory_gb=total_memory_gb, preferred_slug=preferred_slug)
    plan: dict = {"machine_memory_gb": total_memory_gb}
    plan["selected_pair"] = asdict(chosen)
    plan["setup_commands"] = build_setup_commands(chosen)
    plan["benchmark_command"] = build_mlx_benchmark_command(chosen)
    plan["baseline_note"] = (
        "Compare against plain MLX or llama.cpp speculative decoding on the same prompt set."
    )
    return plan
 def write_output(path: Path, content: str) -> None:
    """Write ``content`` to ``path`` as UTF-8, creating missing parent directories first."""
    destination_dir = path.parent
    # Tolerate an already-existing directory so repeated runs can overwrite the file.
    destination_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
 def main(argv: Optional[Iterable[str]] = None) -> int:
    """Run the planner CLI and return the process exit status (always 0).

    ``argv`` overrides the process arguments when given. Memory is detected
    unless ``--memory-gb`` is supplied. The selected pair is rendered either
    as a JSON plan or as the Markdown report template, then written to
    ``--output`` if set, otherwise printed to stdout.
    """
    cli = argparse.ArgumentParser(description="Plan Apple Silicon DFlash benchmarks")
    cli.add_argument("--memory-gb", type=float, default=None, help="Override detected total memory")
    cli.add_argument("--pair", choices=[entry.slug for entry in SUPPORTED_PAIRS], default=None)
    cli.add_argument("--machine-label", default="Apple Silicon Mac")
    cli.add_argument("--format", choices=["json", "markdown"], default="markdown")
    cli.add_argument("--output", default=None, help="Write plan/report to file instead of stdout")
    raw_args = None if argv is None else list(argv)
    options = cli.parse_args(raw_args)

    # Only probe the host when the caller did not pin a memory figure.
    memory_gb = detect_total_memory_gb() if options.memory_gb is None else options.memory_gb
    chosen = select_pair(total_memory_gb=memory_gb, preferred_slug=options.pair)

    if options.format == "json":
        rendered = json.dumps(build_plan(memory_gb, preferred_slug=chosen.slug), indent=2)
    else:
        rendered = render_report_template(options.machine_label, chosen)

    if not options.output:
        print(rendered)
        return 0
    write_output(Path(options.output), rendered)
    return 0
 # Run the CLI only when executed as a script; main()'s return value becomes the exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/benchmarks/reports/dflash_m3max_36gb.md
+++ b/benchmarks/reports/dflash_m3max_36gb.md
@@ -0,0 +1,41 @@
 # DFlash Apple Silicon Benchmark Report
 ## Machine
 - Label: M3 Max 36GB
 - Selected pair: qwen35-9b
 - Base model: Qwen/Qwen3.5-9B
 - Draft model: z-lab/Qwen3.5-9B-DFlash
 - Estimated total weight footprint: 19.93 GB
 ## Setup
 ```bash
 python3 -m venv .venv-dflash
 source .venv-dflash/bin/activate
 git clone https://github.com/z-lab/dflash.git
 cd dflash
 pip install -e .[mlx]
 python -m dflash.benchmark --backend mlx \
    --model Qwen/Qwen3.5-9B \
    --draft-model z-lab/Qwen3.5-9B-DFlash \
    --dataset gsm8k \
    --max-samples 128 \
    --enable-thinking \
    --draft-sliding-window-size 4096
 ```
 ## Baseline comparison
 Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set.
 ## Results
 - Throughput (tok/s):
 - Peak memory (GB):
 - Notes on acceptance / behavior:
 ## Verdict
 Worth operationalizing locally?
 - [ ] Yes
 - [ ] No
 - [ ] Needs more data
 ## Recommendation
 Explain whether this should become part of the local inference stack.
--- a/benchmarks/reports/dflash_m3max_36gb_qwen35_4b_pilot.md
+++ b/benchmarks/reports/dflash_m3max_36gb_qwen35_4b_pilot.md
@@ -0,0 +1,46 @@
 # DFlash Apple Silicon Pilot — Qwen3.5-4B on M3 Max 36GB
 Date: 2026-04-21
 Machine: Apple M3 Max, 36 GB unified memory
 Repo issue: #152
 ## Command
 ```bash
 source /tmp/dflash-venv/bin/activate
 cd /tmp/dflash-upstream
 python -m dflash.benchmark --backend mlx \
    --model Qwen/Qwen3.5-4B \
    --draft-model z-lab/Qwen3.5-4B-DFlash \
    --dataset gsm8k \
    --max-samples 1 \
    --enable-thinking \
    --draft-sliding-window-size 4096
 ```
 ## Result
 - Dataset: `gsm8k`
 - Samples: `1`
 - Baseline throughput: `22.35 tok/s`
 - DFlash throughput: `46.78 tok/s`
 - Decoding speedup: `2.09x`
 - Average acceptance length: `6.48`
 Acceptance length histogram:
 ```text
 ['0.3%', '11.1%', '12.7%', '10.4%', '11.7%', '7.6%', '7.0%', '3.8%', '5.1%', '6.3%', '2.8%', '3.8%', '2.2%', '1.9%', '0.9%', '2.5%', '9.8%']
 ```
 ## Caveats
 - This is a **pilot**, not a decision-grade benchmark.
 - Only `1` sample was run, so the throughput number is directional.
 - No apples-to-apples baseline against plain MLX or llama.cpp speculative decoding is included yet.
 - The planner still recommends trying `Qwen/Qwen3.5-9B + z-lab/Qwen3.5-9B-DFlash` on this machine for the more meaningful fit test.
 ## Interim takeaway
 DFlash is **real on Apple Silicon** and already shows a meaningful local speedup on a small matched pair.
 A `2.09x` pilot speedup on `Qwen3.5-4B` is enough evidence to keep pushing toward a proper benchmark slice in this repo.
--- a/benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md
+++ b/benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md
@@ -0,0 +1,59 @@
 # DFlash on Apple Silicon Failure Report — Qwen3.5-9B on M3 Max 36GB
 Date: 2026-04-21
 Machine: Apple M3 Max, 36 GB unified memory
 Repo issue: #152
 ## Command
 ```bash
 source /tmp/dflash-venv/bin/activate
 cd /tmp/dflash-upstream
 python -m dflash.benchmark --backend mlx \
    --model Qwen/Qwen3.5-9B \
    --draft-model z-lab/Qwen3.5-9B-DFlash \
    --dataset gsm8k \
    --max-samples 1 \
    --enable-thinking \
    --draft-sliding-window-size 4096
 ```
 ## Outcome
 The benchmark did **not** complete successfully on this machine.
 ### Failure signature
 ```text
 libc++abi: terminating due to uncaught exception of type std::runtime_error:
 [METAL] Command buffer execution failed:
 Caused GPU Timeout Error (00000002:kIOGPUCommandBufferCallbackErrorTimeout)
 ```
 Additional shutdown noise:
 ```text
 bash: [11285: 1] tcsetattr: Inappropriate ioctl for device
 resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
 ```
 ## Interpretation
 This is strong evidence that the `Qwen/Qwen3.5-9B + z-lab/Qwen3.5-9B-DFlash` pair is **not currently stable** on an M3 Max 36GB Mac under the upstream MLX benchmark path, at least with the default settings used here.
 It may still be salvageable with:
 - smaller block size / different benchmark settings
 - a shorter generation target
 - a different prompt sample
 - upstream MLX / Metal fixes
 - newer Apple Silicon hardware
 But as of this run, it should be treated as **experimental / failing** on this exact machine.
 ## Recommendation
 For this Mac, the working local proof path is still:
 - `Qwen/Qwen3.5-4B`
 - `z-lab/Qwen3.5-4B-DFlash`
 Use the 4B pair for reproducible local validation while the 9B Metal timeout is investigated separately.
--- a/docs/DFLASH_APPLE_SILICON.md
+++ b/docs/DFLASH_APPLE_SILICON.md
@@ -0,0 +1,125 @@
 # DFlash on Apple Silicon
 This repo now carries a **Gitea-first benchmark harness** for evaluating whether upstream **DFlash on MLX** is worth adding to the local Apple Silicon inference stack.
 ## Why
 The headline `Kimi K2.6 + DFlash` benchmark was measured on `8x MI300X` with huge RAM and ROCm patches. That exact recipe is not a fit for a `36 GB` Apple Silicon Mac.
 What *is* relevant locally is the upstream `z-lab/dflash` MLX path, which can benchmark smaller matched target/draft pairs that fit on Apple Silicon.
 ## Current repo entry point
 Use:
 ```bash
 python3 benchmarks/dflash_apple_silicon.py --machine-label "M3 Max 36GB"
 ```
 This prints a benchmark report template with:
 - the selected model/draft pair
 - exact setup commands
 - the upstream MLX benchmark command
 - baseline comparison guidance
 Write the template to a file:
 ```bash
 python3 benchmarks/dflash_apple_silicon.py \
  --machine-label "M3 Max 36GB" \
  --output benchmarks/reports/dflash_m3max_36gb.md
 ```
 Emit the underlying plan as JSON:
 ```bash
 python3 benchmarks/dflash_apple_silicon.py --format json
 ```
 ## Selection logic
 Today the planner uses two upstream-supported MLX pairs:
 - `qwen35-9b`
  - base: `Qwen/Qwen3.5-9B`
  - draft: `z-lab/Qwen3.5-9B-DFlash`
  - chosen for ~28 GB+ machines
 - `qwen35-4b`
  - base: `Qwen/Qwen3.5-4B`
  - draft: `z-lab/Qwen3.5-4B-DFlash`
  - fallback for tighter-memory Macs
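 On a `36 GB` Mac, the default recommendation is `qwen35-9b`. Note that this pair currently fails on the tested M3 Max 36GB (see "Known 9B failure on this machine" below); pass `--pair qwen35-4b` to plan the known-working pair instead.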
 ## Pilot result already landed
 A first live Apple Silicon run has already been captured in:
 - `benchmarks/reports/dflash_m3max_36gb_qwen35_4b_pilot.md`
 Pilot command:
 ```bash
 python -m dflash.benchmark --backend mlx \
    --model Qwen/Qwen3.5-4B \
    --draft-model z-lab/Qwen3.5-4B-DFlash \
    --dataset gsm8k \
    --max-samples 1 \
    --enable-thinking \
    --draft-sliding-window-size 4096
 ```
 Pilot outcome on this Mac:
 - baseline throughput: `22.35 tok/s`
 - DFlash throughput: `46.78 tok/s`
 - decoding speedup: `2.09x`
 Treat that as a **directional proof**, not a final decision benchmark. The next step is the fuller comparison slice against plain MLX or llama.cpp speculative decoding.
 ## Known 9B failure on this machine
 A follow-up live run with:
 - `Qwen/Qwen3.5-9B`
 - `z-lab/Qwen3.5-9B-DFlash`
 failed on this same M3 Max 36GB Mac with:
 ```text
 [METAL] Command buffer execution failed:
 Caused GPU Timeout Error (00000002:kIOGPUCommandBufferCallbackErrorTimeout)
 ```
 That failure is recorded in:
 - `benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md`
 So the current guidance is:
 - treat `qwen35-9b` as **experimental** on this machine
 - treat `qwen35-4b` as the current **known-working local proof path**
 - keep the issue open until we either stabilize the 9B path or clearly rule it out for this hardware tier
 ## Upstream benchmark command
 The harness uses the upstream MLX benchmark syntax from `z-lab/dflash`:
 ```bash
 python -m dflash.benchmark --backend mlx \
    --model Qwen/Qwen3.5-9B \
    --draft-model z-lab/Qwen3.5-9B-DFlash \
    --dataset gsm8k \
    --max-samples 128 \
    --enable-thinking \
    --draft-sliding-window-size 4096
 ```
 ## What remains
 This PR adds the **planner + report template** so the benchmark is reproducible from the repo.
 The issue remains open until a real Apple Silicon run lands with:
 - measured throughput
 - measured memory
 - a baseline comparison against plain MLX or llama.cpp speculative decoding
 - a recommendation on whether to operationalize DFlash locally
--- a/evolution/quant_selector.py
+++ b/evolution/quant_selector.py
@@ -379,8 +379,8 @@ def select_quant_level(
            break
    if chosen is None:
-        # Nothing fits — pick the most aggressive compression
+        # Nothing fits — pick the most aggressive compression, not the q4_0 fallback.
-        chosen = QUANT_LEVELS[-1]
+        chosen = max(QUANT_LEVELS, key=lambda level: level.compression_ratio)
        logger.warning(f"No quant level fits in {memory_pool_gb:.1f}GB. Using {chosen.name}.")
    # Calculate final numbers
--- a/tests/test_dflash_apple_silicon.py
+++ b/tests/test_dflash_apple_silicon.py
@@ -0,0 +1,58 @@
 #!/usr/bin/env python3
 """Tests for Apple Silicon DFlash benchmark planning helpers (issue #152)."""
 import os
 import sys
 from unittest.mock import patch
 sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
 from benchmarks.dflash_apple_silicon import (  # noqa: E402
    build_mlx_benchmark_command,
    detect_total_memory_gb,
    render_report_template,
    select_pair,
 )
 class TestPairSelection:
    """Checks which pair ``select_pair`` returns for a given memory size."""

    def test_prefers_qwen35_9b_on_36gb_mac(self):
        """36 GB should select the 9B target/draft pair."""
        chosen = select_pair(total_memory_gb=36)
        assert (chosen.slug, chosen.base_model, chosen.draft_model) == (
            "qwen35-9b",
            "Qwen/Qwen3.5-9B",
            "z-lab/Qwen3.5-9B-DFlash",
        )

    def test_falls_back_to_4b_when_memory_is_tight(self):
        """20 GB should select the 4B pair."""
        chosen = select_pair(total_memory_gb=20)
        assert (chosen.slug, chosen.base_model) == ("qwen35-4b", "Qwen/Qwen3.5-4B")
 class TestCommandGeneration:
    """Checks the text produced by ``build_mlx_benchmark_command``."""

    def test_builds_upstream_mlx_benchmark_command(self):
        """The command for the 36 GB pair should contain every expected flag."""
        command = build_mlx_benchmark_command(
            select_pair(total_memory_gb=36),
            dataset="gsm8k",
            max_samples=64,
        )
        expected_fragments = (
            "python -m dflash.benchmark --backend mlx",
            "--model Qwen/Qwen3.5-9B",
            "--draft-model z-lab/Qwen3.5-9B-DFlash",
            "--dataset gsm8k",
            "--max-samples 64",
            "--draft-sliding-window-size 4096",
        )
        for fragment in expected_fragments:
            assert fragment in command
 class TestReportTemplate:
    """Checks the Markdown produced by ``render_report_template``."""

    def test_report_template_mentions_baseline_and_verdict(self):
        """The rendered report should carry the title, label, model, baseline note and verdict prompt."""
        report = render_report_template(
            machine_label="M3 Max 36GB",
            pair=select_pair(total_memory_gb=36),
        )
        required_snippets = (
            "DFlash Apple Silicon Benchmark Report",
            "M3 Max 36GB",
            "Qwen/Qwen3.5-9B",
            "plain MLX or llama.cpp speculative decoding",
            "Worth operationalizing locally?",
        )
        for snippet in required_snippets:
            assert snippet in report
 class TestMemoryDetection:
    """Checks ``detect_total_memory_gb`` against patched platform probes."""

    @patch("benchmarks.dflash_apple_silicon.platform.system", return_value="Darwin")
    @patch("benchmarks.dflash_apple_silicon.subprocess.check_output", return_value=b"38654705664\n")
    def test_detect_total_memory_gb_on_macos(self, _mock_sysctl, _mock_system):
        """A patched sysctl reply of 38654705664 bytes should be reported as 36.0."""
        detected = detect_total_memory_gb()
        assert detected == 36.0
--- a/tests/test_quant_selector.py
+++ b/tests/test_quant_selector.py
@@ -19,10 +19,11 @@ from evolution.quant_selector import (
 class TestQuantLevels:
-    def test_levels_ordered_by_quality(self):
+    def test_levels_keep_turboquant_quality_order_with_q4_fallback_last(self):
-        """Levels should be ordered from best quality to most aggressive."""
+        """TurboQuant levels should lead, with q4_0 reserved as the non-Turbo fallback."""
-        for i in range(len(QUANT_LEVELS) - 1):
+        names = [level.name for level in QUANT_LEVELS]
-            assert QUANT_LEVELS[i].bits_per_channel > QUANT_LEVELS[i + 1].bits_per_channel
+        assert names[:3] == ["turbo4", "turbo3", "turbo2"]
        assert names[-1] == "q4_0"
    def test_all_levels_have_required_fields(self):
        for level in QUANT_LEVELS:
@@ -148,6 +149,19 @@ class TestSelection:
            sel = select_quant_level(model_size_gb=16.0, context_length=65536)
            assert len(sel.warnings) > 0
    def test_falls_back_to_turbo2_when_nothing_fits(self):
        """With mocked 8 GB hardware and a 16 GB model at 131072 context, turbo2 should be selected."""
        with patch("evolution.quant_selector.detect_hardware") as mock_hw:
            tiny_hardware = HardwareInfo(
                total_memory_gb=8,
                available_memory_gb=6,
                gpu_memory_gb=8,
                gpu_name="Tiny GPU",
                cpu_cores=4,
                detection_method="mock",
            )
            mock_hw.return_value = tiny_hardware
            selection = select_quant_level(model_size_gb=16.0, context_length=131072)
            chosen_name = selection.level.name
            assert chosen_name == "turbo2"
    def test_reasoning_contains_key_info(self):
        with patch("evolution.quant_selector.detect_hardware") as mock_hw:
            mock_hw.return_value = HardwareInfo(
Author	SHA1	Message	Date
Alexander Whitestone	dabb96d315	docs: record Qwen3.5-9B DFlash Metal timeout (refs #152 , #154 ) All checks were successful Smoke Test / smoke (pull_request) Successful in 19s Details	2026-04-21 22:25:25 -04:00
Alexander Whitestone	69cef8a90f	bench: record Apple Silicon DFlash pilot result (refs #152 ) All checks were successful Smoke Test / smoke (pull_request) Successful in 18s Details	2026-04-21 22:20:15 -04:00
Alexander Whitestone	636d294896	feat: add Apple Silicon DFlash benchmark planner (refs #152 ) All checks were successful Smoke Test / smoke (pull_request) Successful in 18s Details	2026-04-21 22:00:22 -04:00