# benchmarks/dflash_apple_silicon.py

#!/usr/bin/env python3
"""Apple Silicon DFlash planning helpers and CLI (issue #152)."""

from __future__ import annotations

import argparse
import json
import platform
import subprocess
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable, Optional


@dataclass(frozen=True)
class DFlashPair:
    """Immutable description of one base-model / draft-model pairing."""

    # Short identifier used by get_pair() lookups and as a CLI ``--pair`` choice.
    slug: str
    # Model id placed after ``--model`` in the generated benchmark command.
    base_model: str
    # Model id placed after ``--draft-model`` in the generated benchmark command.
    draft_model: str
    # Shown in the report as "Estimated total weight footprint" (GB).
    estimated_total_weights_gb: float
    # Threshold that select_pair() compares the machine's total memory against.
    minimum_recommended_memory_gb: float
    # Value placed after ``--draft-sliding-window-size`` in the benchmark command.
    draft_sliding_window_size: int = 4096


# Pairs the planner can recommend. select_pair() returns the first entry when
# no pair's memory threshold is met, so the first entry acts as the fallback.
SUPPORTED_PAIRS: tuple[DFlashPair, ...] = (
    DFlashPair(
        slug="qwen35-4b",
        base_model="Qwen/Qwen3.5-4B",
        draft_model="z-lab/Qwen3.5-4B-DFlash",
        estimated_total_weights_gb=9.68,
        minimum_recommended_memory_gb=16.0,
    ),
    DFlashPair(
        slug="qwen35-9b",
        base_model="Qwen/Qwen3.5-9B",
        draft_model="z-lab/Qwen3.5-9B-DFlash",
        estimated_total_weights_gb=19.93,
        minimum_recommended_memory_gb=28.0,
    ),
)


def detect_total_memory_gb() -> float:
    """Detect total system memory in GiB, rounded to one decimal place.

    On macOS the value is read from ``sysctl -n hw.memsize`` (bytes); on Linux
    it is read from the ``MemTotal`` line of ``/proc/meminfo`` (KiB).

    Returns:
        Total memory in GiB, rounded to one decimal place.

    Raises:
        RuntimeError: If the platform is neither macOS nor Linux, or if
            ``/proc/meminfo`` contains no ``MemTotal`` entry.
        subprocess.CalledProcessError: If ``sysctl`` exits with a non-zero
            status on macOS.
    """
    system = platform.system()
    if system == "Darwin":
        raw_bytes = subprocess.check_output(["sysctl", "-n", "hw.memsize"])
        return round(int(raw_bytes.strip()) / (1024 ** 3), 1)
    if system == "Linux":
        with open("/proc/meminfo", "r", encoding="utf-8") as handle:
            for line in handle:
                if line.startswith("MemTotal:"):
                    mem_kb = int(line.split()[1])
                    return round(mem_kb / (1024 ** 2), 1)
        # Report the real cause instead of falling through to the
        # "Unsupported platform" message below.
        raise RuntimeError("MemTotal entry not found in /proc/meminfo")
    raise RuntimeError(f"Unsupported platform for memory detection: {system}")


def get_pair(slug: str) -> DFlashPair:
    """Return the entry of ``SUPPORTED_PAIRS`` whose slug equals *slug*.

    Raises:
        ValueError: If no supported pair has that slug.
    """
    found = next((candidate for candidate in SUPPORTED_PAIRS if candidate.slug == slug), None)
    if found is None:
        raise ValueError(f"Unknown DFlash pair: {slug}")
    return found


def select_pair(total_memory_gb: float, preferred_slug: Optional[str] = None) -> DFlashPair:
    """Choose the pair to benchmark on a machine with *total_memory_gb* of memory.

    A non-empty *preferred_slug* wins outright and is resolved through
    ``get_pair`` (which raises ``ValueError`` for unknown slugs). Otherwise the
    pair with the highest memory threshold that the machine still meets is
    returned; if none is met, the first supported pair is returned.
    """
    if preferred_slug:
        return get_pair(preferred_slug)

    def threshold(candidate: DFlashPair) -> float:
        return candidate.minimum_recommended_memory_gb

    affordable = (candidate for candidate in SUPPORTED_PAIRS if total_memory_gb >= threshold(candidate))
    # ``default`` covers the case where the machine meets no threshold at all.
    return max(affordable, key=threshold, default=SUPPORTED_PAIRS[0])


def build_mlx_benchmark_command(
    pair: DFlashPair,
    *,
    dataset: str = "gsm8k",
    max_samples: int = 128,
    enable_thinking: bool = True,
) -> str:
    """Assemble the multi-line ``dflash.benchmark`` MLX invocation for *pair*.

    Args:
        pair: Supplies the base model, draft model and sliding-window size.
        dataset: Value for ``--dataset``.
        max_samples: Value for ``--max-samples``.
        enable_thinking: When true, ``--enable-thinking`` is included.

    Returns:
        The command as one string, with arguments separated by a
        backslash-newline continuation and a four-space indent.
    """
    thinking_flag = ["--enable-thinking"] if enable_thinking else []
    arguments = [
        "python -m dflash.benchmark --backend mlx",
        f"--model {pair.base_model}",
        f"--draft-model {pair.draft_model}",
        f"--dataset {dataset}",
        f"--max-samples {max_samples}",
        *thinking_flag,
        f"--draft-sliding-window-size {pair.draft_sliding_window_size}",
    ]
    continuation = " \\\n    "
    return continuation.join(arguments)


def build_setup_commands(pair: DFlashPair) -> list[str]:
    """Return the ordered shell commands that install DFlash and run the benchmark.

    Args:
        pair: The model pairing whose benchmark command ends the list.

    Returns:
        Commands that create and activate a virtualenv, clone and install
        DFlash with its MLX extra, then run the benchmark built by
        ``build_mlx_benchmark_command``.
    """
    return [
        "python3 -m venv .venv-dflash",
        "source .venv-dflash/bin/activate",
        "git clone https://github.com/z-lab/dflash.git",
        "cd dflash",
        # Quote the extras spec: zsh (the default macOS shell) treats an
        # unquoted ``.[mlx]`` as a glob pattern and aborts with
        # "no matches found". The quoted form also works in bash.
        'pip install -e ".[mlx]"',
        build_mlx_benchmark_command(pair),
    ]


def render_report_template(machine_label: str, pair: DFlashPair) -> str:
    """Render the Markdown benchmark-report skeleton for one machine and pair.

    Args:
        machine_label: Free-form machine description shown under "Machine".
        pair: The selected model pairing; its fields and setup commands are
            embedded in the report.

    Returns:
        The Markdown document as a single string ending in a newline.
    """
    # Take the setup steps from build_setup_commands() instead of repeating
    # them here, so the Markdown report and the JSON plan cannot drift apart.
    setup_block = "\n".join(build_setup_commands(pair))
    return f"""# DFlash Apple Silicon Benchmark Report

## Machine
- Label: {machine_label}
- Selected pair: {pair.slug}
- Base model: {pair.base_model}
- Draft model: {pair.draft_model}
- Estimated total weight footprint: {pair.estimated_total_weights_gb:.2f} GB

## Setup
```bash
{setup_block}
```

## Baseline comparison
Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set.

## Results
- Throughput (tok/s):
- Peak memory (GB):
- Notes on acceptance / behavior:

## Verdict
Worth operationalizing locally?
- [ ] Yes
- [ ] No
- [ ] Needs more data

## Recommendation
Explain whether this should become part of the local inference stack.
"""


def build_plan(total_memory_gb: float, preferred_slug: Optional[str] = None) -> dict:
    """Build the JSON-serializable benchmark plan for a machine.

    Args:
        total_memory_gb: Machine memory, echoed back under ``machine_memory_gb``
            and used for pair selection.
        preferred_slug: Optional slug that overrides automatic selection.

    Returns:
        A dict holding the machine memory, the selected pair as a plain dict,
        the setup commands, the benchmark command and a baseline note.
    """
    chosen = select_pair(total_memory_gb=total_memory_gb, preferred_slug=preferred_slug)
    # Keys are inserted in the order they should appear in the serialized plan.
    plan: dict = {"machine_memory_gb": total_memory_gb}
    plan["selected_pair"] = asdict(chosen)
    plan["setup_commands"] = build_setup_commands(chosen)
    plan["benchmark_command"] = build_mlx_benchmark_command(chosen)
    plan["baseline_note"] = "Compare against plain MLX or llama.cpp speculative decoding on the same prompt set."
    return plan


def write_output(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8, creating missing parent directories."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)


def main(argv: Optional[Iterable[str]] = None) -> int:
    """Run the planner CLI and return the process exit status.

    Args:
        argv: Command-line arguments to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` after the plan or report has been printed or written.
    """
    parser = argparse.ArgumentParser(description="Plan Apple Silicon DFlash benchmarks")
    # Options are registered in this order so ``--help`` lists them the same way.
    option_specs = (
        ("--memory-gb", {"type": float, "default": None, "help": "Override detected total memory"}),
        ("--pair", {"choices": [pair.slug for pair in SUPPORTED_PAIRS], "default": None}),
        ("--machine-label", {"default": "Apple Silicon Mac"}),
        ("--format", {"choices": ["json", "markdown"], "default": "markdown"}),
        ("--output", {"default": None, "help": "Write plan/report to file instead of stdout"}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    cli_args = None if argv is None else list(argv)
    args = parser.parse_args(cli_args)

    # Only probe the machine when the user did not supply a memory figure.
    memory_gb = detect_total_memory_gb() if args.memory_gb is None else args.memory_gb
    pair = select_pair(total_memory_gb=memory_gb, preferred_slug=args.pair)

    if args.format == "json":
        plan = build_plan(memory_gb, preferred_slug=pair.slug)
        content = json.dumps(plan, indent=2)
    else:
        content = render_report_template(args.machine_label, pair)

    if not args.output:
        print(content)
        return 0

    write_output(Path(args.output), content)
    return 0


# Run the CLI when executed as a script; main()'s return value becomes the
# process exit status via SystemExit.
if __name__ == "__main__":
    raise SystemExit(main())
feat: add Apple Silicon DFlash benchmark planner (refs #152) 2026-04-21 22:00:22 -04:00			`#!/usr/bin/env python3`
			`"""Apple Silicon DFlash planning helpers and CLI (issue #152)."""`

			`from __future__ import annotations`

			`import argparse`
			`import json`
			`import platform`
			`import subprocess`
			`from dataclasses import asdict, dataclass`
			`from pathlib import Path`
			`from typing import Iterable, Optional`


			`@dataclass(frozen=True)`
			`class DFlashPair:`
			`slug: str`
			`base_model: str`
			`draft_model: str`
			`estimated_total_weights_gb: float`
			`minimum_recommended_memory_gb: float`
			`draft_sliding_window_size: int = 4096`


			`SUPPORTED_PAIRS: tuple[DFlashPair, ...] = (`
			`DFlashPair(`
			`slug="qwen35-4b",`
			`base_model="Qwen/Qwen3.5-4B",`
			`draft_model="z-lab/Qwen3.5-4B-DFlash",`
			`estimated_total_weights_gb=9.68,`
			`minimum_recommended_memory_gb=16.0,`
			`),`
			`DFlashPair(`
			`slug="qwen35-9b",`
			`base_model="Qwen/Qwen3.5-9B",`
			`draft_model="z-lab/Qwen3.5-9B-DFlash",`
			`estimated_total_weights_gb=19.93,`
			`minimum_recommended_memory_gb=28.0,`
			`),`
			`)`


			`def detect_total_memory_gb() -> float:`
			`"""Detect total system memory in GiB, rounded to a whole number for planning."""`
			`system = platform.system()`
			`if system == "Darwin":`
			`mem_bytes = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip())`
			`return round(mem_bytes / (1024 ** 3), 1)`
			`if system == "Linux":`
			`with open("/proc/meminfo", "r", encoding="utf-8") as handle:`
			`for line in handle:`
			`if line.startswith("MemTotal:"):`
			`mem_kb = int(line.split()[1])`
			`return round(mem_kb / (1024 ** 2), 1)`
			`raise RuntimeError(f"Unsupported platform for memory detection: {system}")`


			`def get_pair(slug: str) -> DFlashPair:`
			`for pair in SUPPORTED_PAIRS:`
			`if pair.slug == slug:`
			`return pair`
			`raise ValueError(f"Unknown DFlash pair: {slug}")`


			`def select_pair(total_memory_gb: float, preferred_slug: Optional[str] = None) -> DFlashPair:`
			`"""Pick the strongest upstream-supported pair likely to fit the machine."""`
			`if preferred_slug:`
			`return get_pair(preferred_slug)`

			`fitting = [pair for pair in SUPPORTED_PAIRS if total_memory_gb >= pair.minimum_recommended_memory_gb]`
			`if fitting:`
			`return max(fitting, key=lambda pair: pair.minimum_recommended_memory_gb)`
			`return SUPPORTED_PAIRS[0]`


			`def build_mlx_benchmark_command(`
			`pair: DFlashPair,`
			`*,`
			`dataset: str = "gsm8k",`
			`max_samples: int = 128,`
			`enable_thinking: bool = True,`
			`) -> str:`
			`"""Build the upstream MLX benchmark command from the DFlash README."""`
			`parts = [`
			`"python -m dflash.benchmark --backend mlx",`
			`f"--model {pair.base_model}",`
			`f"--draft-model {pair.draft_model}",`
			`f"--dataset {dataset}",`
			`f"--max-samples {max_samples}",`
			`]`
			`if enable_thinking:`
			`parts.append("--enable-thinking")`
			`parts.append(f"--draft-sliding-window-size {pair.draft_sliding_window_size}")`
			`return " \\\n ".join(parts)`


			`def build_setup_commands(pair: DFlashPair) -> list[str]:`
			`return [`
			`"python3 -m venv .venv-dflash",`
			`"source .venv-dflash/bin/activate",`
			`"git clone https://github.com/z-lab/dflash.git",`
			`"cd dflash",`
			`"pip install -e .[mlx]",`
			`build_mlx_benchmark_command(pair),`
			`]`


			`def render_report_template(machine_label: str, pair: DFlashPair) -> str:`
			`command = build_mlx_benchmark_command(pair)`
			`return f"""# DFlash Apple Silicon Benchmark Report`

			`## Machine`
			`- Label: {machine_label}`
			`- Selected pair: {pair.slug}`
			`- Base model: {pair.base_model}`
			`- Draft model: {pair.draft_model}`
			`- Estimated total weight footprint: {pair.estimated_total_weights_gb:.2f} GB`

			`## Setup`
			```bash
			`python3 -m venv .venv-dflash`
			`source .venv-dflash/bin/activate`
			`git clone https://github.com/z-lab/dflash.git`
			`cd dflash`
			`pip install -e .[mlx]`
			`{command}`
			```

			`## Baseline comparison`
			`Compare against plain MLX or llama.cpp speculative decoding on the same prompt set.`

			`## Results`
			`- Throughput (tok/s):`
			`- Peak memory (GB):`
			`- Notes on acceptance / behavior:`

			`## Verdict`
			`Worth operationalizing locally?`
			`- [ ] Yes`
			`- [ ] No`
			`- [ ] Needs more data`

			`## Recommendation`
			`Explain whether this should become part of the local inference stack.`
			`"""`


			`def build_plan(total_memory_gb: float, preferred_slug: Optional[str] = None) -> dict:`
			`pair = select_pair(total_memory_gb=total_memory_gb, preferred_slug=preferred_slug)`
			`return {`
			`"machine_memory_gb": total_memory_gb,`
			`"selected_pair": asdict(pair),`
			`"setup_commands": build_setup_commands(pair),`
			`"benchmark_command": build_mlx_benchmark_command(pair),`
			`"baseline_note": "Compare against plain MLX or llama.cpp speculative decoding on the same prompt set.",`
			`}`


			`def write_output(path: Path, content: str) -> None:`
			`path.parent.mkdir(parents=True, exist_ok=True)`
			`path.write_text(content, encoding="utf-8")`


			`def main(argv: Optional[Iterable[str]] = None) -> int:`
			`parser = argparse.ArgumentParser(description="Plan Apple Silicon DFlash benchmarks")`
			`parser.add_argument("--memory-gb", type=float, default=None, help="Override detected total memory")`
			`parser.add_argument("--pair", choices=[pair.slug for pair in SUPPORTED_PAIRS], default=None)`
			`parser.add_argument("--machine-label", default="Apple Silicon Mac")`
			`parser.add_argument("--format", choices=["json", "markdown"], default="markdown")`
			`parser.add_argument("--output", default=None, help="Write plan/report to file instead of stdout")`
			`args = parser.parse_args(list(argv) if argv is not None else None)`

			`memory_gb = args.memory_gb if args.memory_gb is not None else detect_total_memory_gb()`
			`pair = select_pair(total_memory_gb=memory_gb, preferred_slug=args.pair)`

			`if args.format == "json":`
			`content = json.dumps(build_plan(memory_gb, preferred_slug=pair.slug), indent=2)`
			`else:`
			`content = render_report_template(args.machine_label, pair)`

			`if args.output:`
			`write_output(Path(args.output), content)`
			`else:`
			`print(content)`
			`return 0`


			`if __name__ == "__main__":`
			`raise SystemExit(main())`