diff --git a/README.md b/README.md index 03a07fe..97642aa 100644 --- a/README.md +++ b/README.md @@ -30,3 +30,4 @@ See [issues](https://forge.alexanderwhitestone.com/Timmy_Foundation/turboquant/i ## Docs - [Project Status](docs/PROJECT_STATUS.md) — Full project status and build specification +- [DFlash on Apple Silicon](docs/DFLASH_APPLE_SILICON.md) — MLX benchmark planner, setup commands, and report workflow diff --git a/benchmarks/dflash_apple_silicon.py b/benchmarks/dflash_apple_silicon.py new file mode 100644 index 0000000..5b3b89d --- /dev/null +++ b/benchmarks/dflash_apple_silicon.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +"""Apple Silicon DFlash planning helpers and CLI (issue #152).""" + +from __future__ import annotations + +import argparse +import json +import platform +import subprocess +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Iterable, Optional + + +@dataclass(frozen=True) +class DFlashPair: + slug: str + base_model: str + draft_model: str + estimated_total_weights_gb: float + minimum_recommended_memory_gb: float + draft_sliding_window_size: int = 4096 + + +SUPPORTED_PAIRS: tuple[DFlashPair, ...] 
= ( + DFlashPair( + slug="qwen35-4b", + base_model="Qwen/Qwen3.5-4B", + draft_model="z-lab/Qwen3.5-4B-DFlash", + estimated_total_weights_gb=9.68, + minimum_recommended_memory_gb=16.0, + ), + DFlashPair( + slug="qwen35-9b", + base_model="Qwen/Qwen3.5-9B", + draft_model="z-lab/Qwen3.5-9B-DFlash", + estimated_total_weights_gb=19.93, + minimum_recommended_memory_gb=28.0, + ), +) + + +def detect_total_memory_gb() -> float: + """Detect total system memory in GiB, rounded to one decimal place for planning.""" + system = platform.system() + if system == "Darwin": + mem_bytes = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip()) + return round(mem_bytes / (1024 ** 3), 1) + if system == "Linux": + with open("/proc/meminfo", "r", encoding="utf-8") as handle: + for line in handle: + if line.startswith("MemTotal:"): + mem_kb = int(line.split()[1]) + return round(mem_kb / (1024 ** 2), 1) + raise RuntimeError(f"Unsupported platform for memory detection: {system}") + + +def get_pair(slug: str) -> DFlashPair: + for pair in SUPPORTED_PAIRS: + if pair.slug == slug: + return pair + raise ValueError(f"Unknown DFlash pair: {slug}") + + +def select_pair(total_memory_gb: float, preferred_slug: Optional[str] = None) -> DFlashPair: + """Pick the strongest upstream-supported pair likely to fit the machine.""" + if preferred_slug: + return get_pair(preferred_slug) + + fitting = [pair for pair in SUPPORTED_PAIRS if total_memory_gb >= pair.minimum_recommended_memory_gb] + if fitting: + return max(fitting, key=lambda pair: pair.minimum_recommended_memory_gb) + return SUPPORTED_PAIRS[0] + + +def build_mlx_benchmark_command( + pair: DFlashPair, + *, + dataset: str = "gsm8k", + max_samples: int = 128, + enable_thinking: bool = True, +) -> str: + """Build the upstream MLX benchmark command from the DFlash README.""" + parts = [ + "python -m dflash.benchmark --backend mlx", + f"--model {pair.base_model}", + f"--draft-model {pair.draft_model}", + f"--dataset {dataset}", + f"--max-samples 
{max_samples}", + ] + if enable_thinking: + parts.append("--enable-thinking") + parts.append(f"--draft-sliding-window-size {pair.draft_sliding_window_size}") + return " \\\n ".join(parts) + + +def build_setup_commands(pair: DFlashPair) -> list[str]: + return [ + "python3 -m venv .venv-dflash", + "source .venv-dflash/bin/activate", + "git clone https://github.com/z-lab/dflash.git", + "cd dflash", + "pip install -e .[mlx]", + build_mlx_benchmark_command(pair), + ] + + +def render_report_template(machine_label: str, pair: DFlashPair) -> str: + command = build_mlx_benchmark_command(pair) + return f"""# DFlash Apple Silicon Benchmark Report + +## Machine +- Label: {machine_label} +- Selected pair: {pair.slug} +- Base model: {pair.base_model} +- Draft model: {pair.draft_model} +- Estimated total weight footprint: {pair.estimated_total_weights_gb:.2f} GB + +## Setup +```bash +python3 -m venv .venv-dflash +source .venv-dflash/bin/activate +git clone https://github.com/z-lab/dflash.git +cd dflash +pip install -e .[mlx] +{command} +``` + +## Baseline comparison +Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set. + +## Results +- Throughput (tok/s): +- Peak memory (GB): +- Notes on acceptance / behavior: + +## Verdict +Worth operationalizing locally? +- [ ] Yes +- [ ] No +- [ ] Needs more data + +## Recommendation +Explain whether this should become part of the local inference stack. 
+""" + + +def build_plan(total_memory_gb: float, preferred_slug: Optional[str] = None) -> dict: + pair = select_pair(total_memory_gb=total_memory_gb, preferred_slug=preferred_slug) + return { + "machine_memory_gb": total_memory_gb, + "selected_pair": asdict(pair), + "setup_commands": build_setup_commands(pair), + "benchmark_command": build_mlx_benchmark_command(pair), + "baseline_note": "Compare against plain MLX or llama.cpp speculative decoding on the same prompt set.", + } + + +def write_output(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def main(argv: Optional[Iterable[str]] = None) -> int: + parser = argparse.ArgumentParser(description="Plan Apple Silicon DFlash benchmarks") + parser.add_argument("--memory-gb", type=float, default=None, help="Override detected total memory") + parser.add_argument("--pair", choices=[pair.slug for pair in SUPPORTED_PAIRS], default=None) + parser.add_argument("--machine-label", default="Apple Silicon Mac") + parser.add_argument("--format", choices=["json", "markdown"], default="markdown") + parser.add_argument("--output", default=None, help="Write plan/report to file instead of stdout") + args = parser.parse_args(list(argv) if argv is not None else None) + + memory_gb = args.memory_gb if args.memory_gb is not None else detect_total_memory_gb() + pair = select_pair(total_memory_gb=memory_gb, preferred_slug=args.pair) + + if args.format == "json": + content = json.dumps(build_plan(memory_gb, preferred_slug=pair.slug), indent=2) + else: + content = render_report_template(args.machine_label, pair) + + if args.output: + write_output(Path(args.output), content) + else: + print(content) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/reports/dflash_m3max_36gb.md b/benchmarks/reports/dflash_m3max_36gb.md new file mode 100644 index 0000000..812a4ab --- /dev/null +++ 
b/benchmarks/reports/dflash_m3max_36gb.md @@ -0,0 +1,41 @@ +# DFlash Apple Silicon Benchmark Report + +## Machine +- Label: M3 Max 36GB +- Selected pair: qwen35-9b +- Base model: Qwen/Qwen3.5-9B +- Draft model: z-lab/Qwen3.5-9B-DFlash +- Estimated total weight footprint: 19.93 GB + +## Setup +```bash +python3 -m venv .venv-dflash +source .venv-dflash/bin/activate +git clone https://github.com/z-lab/dflash.git +cd dflash +pip install -e .[mlx] +python -m dflash.benchmark --backend mlx \ + --model Qwen/Qwen3.5-9B \ + --draft-model z-lab/Qwen3.5-9B-DFlash \ + --dataset gsm8k \ + --max-samples 128 \ + --enable-thinking \ + --draft-sliding-window-size 4096 +``` + +## Baseline comparison +Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set. + +## Results +- Throughput (tok/s): +- Peak memory (GB): +- Notes on acceptance / behavior: + +## Verdict +Worth operationalizing locally? +- [ ] Yes +- [ ] No +- [ ] Needs more data + +## Recommendation +Explain whether this should become part of the local inference stack. diff --git a/docs/DFLASH_APPLE_SILICON.md b/docs/DFLASH_APPLE_SILICON.md new file mode 100644 index 0000000..be15ef3 --- /dev/null +++ b/docs/DFLASH_APPLE_SILICON.md @@ -0,0 +1,76 @@ +# DFlash on Apple Silicon + +This repo now carries a **Gitea-first benchmark harness** for evaluating whether upstream **DFlash on MLX** is worth adding to the local Apple Silicon inference stack. + +## Why + +The headline `Kimi K2.6 + DFlash` benchmark was measured on `8x MI300X` with huge RAM and ROCm patches. That exact recipe is not a fit for a `36 GB` Apple Silicon Mac. + +What *is* relevant locally is the upstream `z-lab/dflash` MLX path, which can benchmark smaller matched target/draft pairs that fit on Apple Silicon. 
+ +## Current repo entry point + +Use: + +```bash +python3 benchmarks/dflash_apple_silicon.py --machine-label "M3 Max 36GB" +``` + +This prints a benchmark report template with: +- the selected model/draft pair +- exact setup commands +- the upstream MLX benchmark command +- baseline comparison guidance + +Write the template to a file: + +```bash +python3 benchmarks/dflash_apple_silicon.py \ + --machine-label "M3 Max 36GB" \ + --output benchmarks/reports/dflash_m3max_36gb.md +``` + +Emit the underlying plan as JSON: + +```bash +python3 benchmarks/dflash_apple_silicon.py --format json +``` + +## Selection logic + +Today the planner uses two upstream-supported MLX pairs: + +- `qwen35-9b` + - base: `Qwen/Qwen3.5-9B` + - draft: `z-lab/Qwen3.5-9B-DFlash` + - chosen for ~28 GB+ machines +- `qwen35-4b` + - base: `Qwen/Qwen3.5-4B` + - draft: `z-lab/Qwen3.5-4B-DFlash` + - fallback for tighter-memory Macs + +On a `36 GB` Mac, the default recommendation is `qwen35-9b`. + +## Upstream benchmark command + +The harness uses the upstream MLX benchmark syntax from `z-lab/dflash`: + +```bash +python -m dflash.benchmark --backend mlx \ + --model Qwen/Qwen3.5-9B \ + --draft-model z-lab/Qwen3.5-9B-DFlash \ + --dataset gsm8k \ + --max-samples 128 \ + --enable-thinking \ + --draft-sliding-window-size 4096 +``` + +## What remains + +This PR adds the **planner + report template** so the benchmark is reproducible from the repo. 
+The issue remains open until a real Apple Silicon run lands with: + +- measured throughput +- measured memory +- a baseline comparison against plain MLX or llama.cpp speculative decoding +- a recommendation on whether to operationalize DFlash locally diff --git a/evolution/quant_selector.py b/evolution/quant_selector.py index 73a5442..5b2d94c 100644 --- a/evolution/quant_selector.py +++ b/evolution/quant_selector.py @@ -379,8 +379,8 @@ def select_quant_level( break if chosen is None: - # Nothing fits — pick the most aggressive compression - chosen = QUANT_LEVELS[-1] + # Nothing fits — pick the most aggressive compression, not the q4_0 fallback. + chosen = max(QUANT_LEVELS, key=lambda level: level.compression_ratio) logger.warning(f"No quant level fits in {memory_pool_gb:.1f}GB. Using {chosen.name}.") # Calculate final numbers diff --git a/tests/test_dflash_apple_silicon.py b/tests/test_dflash_apple_silicon.py new file mode 100644 index 0000000..6d781d8 --- /dev/null +++ b/tests/test_dflash_apple_silicon.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +"""Tests for Apple Silicon DFlash benchmark planning helpers (issue #152).""" + +import os +import sys +from unittest.mock import patch + +sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) + +from benchmarks.dflash_apple_silicon import ( # noqa: E402 + build_mlx_benchmark_command, + detect_total_memory_gb, + render_report_template, + select_pair, +) + + +class TestPairSelection: + def test_prefers_qwen35_9b_on_36gb_mac(self): + pair = select_pair(total_memory_gb=36) + assert pair.slug == "qwen35-9b" + assert pair.base_model == "Qwen/Qwen3.5-9B" + assert pair.draft_model == "z-lab/Qwen3.5-9B-DFlash" + + def test_falls_back_to_4b_when_memory_is_tight(self): + pair = select_pair(total_memory_gb=20) + assert pair.slug == "qwen35-4b" + assert pair.base_model == "Qwen/Qwen3.5-4B" + + +class TestCommandGeneration: + def test_builds_upstream_mlx_benchmark_command(self): + pair = select_pair(total_memory_gb=36) + 
command = build_mlx_benchmark_command(pair, dataset="gsm8k", max_samples=64) + assert "python -m dflash.benchmark --backend mlx" in command + assert "--model Qwen/Qwen3.5-9B" in command + assert "--draft-model z-lab/Qwen3.5-9B-DFlash" in command + assert "--dataset gsm8k" in command + assert "--max-samples 64" in command + assert "--draft-sliding-window-size 4096" in command + + +class TestReportTemplate: + def test_report_template_mentions_baseline_and_verdict(self): + pair = select_pair(total_memory_gb=36) + report = render_report_template(machine_label="M3 Max 36GB", pair=pair) + assert "DFlash Apple Silicon Benchmark Report" in report + assert "M3 Max 36GB" in report + assert "Qwen/Qwen3.5-9B" in report + assert "plain MLX or llama.cpp speculative decoding" in report + assert "Worth operationalizing locally?" in report + + +class TestMemoryDetection: + @patch("benchmarks.dflash_apple_silicon.platform.system", return_value="Darwin") + @patch("benchmarks.dflash_apple_silicon.subprocess.check_output", return_value=b"38654705664\n") + def test_detect_total_memory_gb_on_macos(self, _mock_sysctl, _mock_system): + assert detect_total_memory_gb() == 36.0 diff --git a/tests/test_quant_selector.py b/tests/test_quant_selector.py index 5447d14..393cc30 100644 --- a/tests/test_quant_selector.py +++ b/tests/test_quant_selector.py @@ -19,10 +19,11 @@ from evolution.quant_selector import ( class TestQuantLevels: - def test_levels_ordered_by_quality(self): - """Levels should be ordered from best quality to most aggressive.""" - for i in range(len(QUANT_LEVELS) - 1): - assert QUANT_LEVELS[i].bits_per_channel > QUANT_LEVELS[i + 1].bits_per_channel + def test_levels_keep_turboquant_quality_order_with_q4_fallback_last(self): + """TurboQuant levels should lead, with q4_0 reserved as the non-Turbo fallback.""" + names = [level.name for level in QUANT_LEVELS] + assert names[:3] == ["turbo4", "turbo3", "turbo2"] + assert names[-1] == "q4_0" def 
test_all_levels_have_required_fields(self): for level in QUANT_LEVELS: @@ -148,6 +149,19 @@ class TestSelection: sel = select_quant_level(model_size_gb=16.0, context_length=65536) assert len(sel.warnings) > 0 + def test_falls_back_to_turbo2_when_nothing_fits(self): + with patch("evolution.quant_selector.detect_hardware") as mock_hw: + mock_hw.return_value = HardwareInfo( + total_memory_gb=8, + available_memory_gb=6, + gpu_memory_gb=8, + gpu_name="Tiny GPU", + cpu_cores=4, + detection_method="mock", + ) + sel = select_quant_level(model_size_gb=16.0, context_length=131072) + assert sel.level.name == "turbo2" + def test_reasoning_contains_key_info(self): with patch("evolution.quant_selector.detect_hardware") as mock_hw: mock_hw.return_value = HardwareInfo(