Compare commits

3 Commits

Author: Alexander Whitestone (all commits)

dabb96d315  docs: record Qwen3.5-9B DFlash Metal timeout (refs #152, #154)
            Smoke Test / smoke (pull_request): successful in 19s
            2026-04-21 22:25:25 -04:00

69cef8a90f  bench: record Apple Silicon DFlash pilot result (refs #152)
            Smoke Test / smoke (pull_request): successful in 18s
            2026-04-21 22:20:15 -04:00

636d294896  feat: add Apple Silicon DFlash benchmark planner (refs #152)
            Smoke Test / smoke (pull_request): successful in 18s
            2026-04-21 22:00:22 -04:00
18 changed files with 544 additions and 869 deletions

View File

@@ -18,17 +18,7 @@ jobs:
find . -name '*.py' | grep -v llama-cpp-fork | xargs -r python3 -m py_compile
find . -name '*.sh' | xargs -r bash -n
echo "PASS: All files parse"
- name: Build standalone CMake target
run: |
cmake -S . -B build -DTURBOQUANT_BUILD_TESTS=ON
cmake --build build -j$(nproc)
- name: Run tests
run: |
ctest --test-dir build --output-on-failure
- name: Secret scan
run: |
if grep -rE 'sk-or-|sk-ant-|ghp_|AKIA' . --include='*.yml' --include='*.py' --include='*.sh' 2>/dev/null | grep -v .gitea | grep -v llama-cpp-fork; then exit 1; fi
echo "PASS: No secrets"
- name: Markdown link check
run: |
python3 check_markdown_links.py

View File

@@ -30,3 +30,4 @@ See [issues](https://forge.alexanderwhitestone.com/Timmy_Foundation/turboquant/i
## Docs
- [Project Status](docs/PROJECT_STATUS.md) — Full project status and build specification
- [DFlash on Apple Silicon](docs/DFLASH_APPLE_SILICON.md) — MLX benchmark planner, setup commands, and report workflow

View File

@@ -1,332 +0,0 @@
#!/usr/bin/env python3
"""
TurboQuant Benchmark Comparison (Issue #29).
Runs multiple inference configurations and produces a side-by-side
comparison table with TTFT, tokens/sec, and peak memory.
Configurations (default):
1. Ollama gemma4 (baseline)
2. llama-server gemma4 f16 KV
3. llama-server gemma4 turbo4 KV
4. llama-server gemma4 turbo4 + layer-adaptive
Usage:
python3 benchmarks/compare_configs.py --help
python3 benchmarks/compare_configs.py --config benchmarks/configs.json
python3 benchmarks/compare_configs.py --demo
"""
import argparse
import json
import os
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Ensure we can import sibling run_benchmarks
sys.path.insert(0, str(Path(__file__).resolve().parent))
try:
from run_benchmarks import (
run_ollama,
run_llama_server,
get_peak_memory_mb,
)
except ImportError:
# Fallback stubs when run_benchmarks (and requests) are unavailable
def run_ollama(prompt, model, url, timeout=120): # type: ignore
return {"status": "skipped", "error": "run_benchmarks not available", "latency_s": 0}
def run_llama_server(prompt, model, url, kv_type="f16", timeout=120): # type: ignore
return {"status": "skipped", "error": "run_benchmarks not available", "latency_s": 0}
def get_peak_memory_mb(): # type: ignore
return 0.0
# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------
@dataclass
class ConfigEntry:
"""One inference configuration to benchmark."""
name: str
backend: str # "ollama" | "llama-server"
model: str
url: str
kv_type: str = "f16"
layer_adaptive: bool = False
env: dict = field(default_factory=dict)
def to_dict(self) -> dict:
return asdict(self)
@dataclass
class ConfigResult:
"""Aggregated results for a single configuration."""
config_name: str
backend: str
model: str
kv_type: str
total_prompts: int
success: int
failed: int
avg_ttft_s: Optional[float]
avg_tok_per_sec: float
avg_latency_s: float
peak_memory_mb: float
winner: bool = False
def to_dict(self) -> dict:
return asdict(self)
# ---------------------------------------------------------------------------
# Default configurations
# ---------------------------------------------------------------------------
DEFAULT_CONFIGS: list[ConfigEntry] = [
ConfigEntry(name="ollama-gemma4", backend="ollama", model="gemma4",
url="http://localhost:11434", kv_type="default"),
ConfigEntry(name="llama-f16", backend="llama-server", model="gemma4",
url="http://localhost:8081", kv_type="f16"),
ConfigEntry(name="llama-turbo4", backend="llama-server", model="gemma4",
url="http://localhost:8081", kv_type="turbo4"),
ConfigEntry(name="llama-turbo4-adaptive", backend="llama-server",
model="gemma4", url="http://localhost:8081",
kv_type="turbo4", layer_adaptive=True),
]
# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------
def load_prompts(prompts_file: str) -> list[dict]:
"""Load test prompts from JSON file."""
with open(prompts_file) as f:
return json.load(f)
def run_config(config: ConfigEntry, prompts: list[dict], timeout: int = 120) -> list[dict]:
"""Run all prompts against a single configuration, return per-prompt results."""
results = []
env_overrides = {**os.environ, **config.env}
if config.layer_adaptive:
env_overrides.setdefault("TURBO_LAYER_ADAPTIVE", "7")
for item in prompts:
if config.backend == "ollama":
result = run_ollama(item["prompt"], config.model, config.url, timeout)
else:
result = run_llama_server(item["prompt"], config.model, config.url,
kv_type=config.kv_type, timeout=timeout)
result["id"] = item.get("id", item.get("category", "unknown"))
result["prompt_preview"] = item["prompt"][:120]
results.append(result)
return results
def aggregate(results: list[dict], config: ConfigEntry, peak_mb: float) -> ConfigResult:
"""Aggregate per-prompt results into a ConfigResult."""
successes = [r for r in results if r.get("status") == "success"]
ttfts = [r["ttft_s"] for r in successes if r.get("ttft_s") is not None]
tps = [r["tokens_per_sec"] for r in successes if r.get("tokens_per_sec")]
lats = [r["latency_s"] for r in successes]
return ConfigResult(
config_name=config.name,
backend=config.backend,
model=config.model,
kv_type=config.kv_type,
total_prompts=len(results),
success=len(successes),
failed=len(results) - len(successes),
avg_ttft_s=round(sum(ttfts) / len(ttfts), 3) if ttfts else None,
avg_tok_per_sec=round(sum(tps) / len(tps), 2) if tps else 0.0,
avg_latency_s=round(sum(lats) / len(lats), 3) if lats else 0.0,
peak_memory_mb=peak_mb,
)
def build_comparison_table(aggregated: list[ConfigResult]) -> str:
"""Build a human-readable comparison table."""
lines = []
header = f"{'Config':<28} {'TTFT':<8} {'tok/s':<10} {'lat(s)':<8} {'mem(MB)':<9} {'ok/n':<6}"
lines.append(header)
lines.append("-" * len(header))
for r in aggregated:
marker = " <- WINNER" if r.winner else ""
ttft = f"{r.avg_ttft_s:.3f}" if r.avg_ttft_s is not None else "N/A"
lines.append(
f"{r.config_name:<28} {ttft:<8} {r.avg_tok_per_sec:<10.2f} "
f"{r.avg_latency_s:<8.3f} {r.peak_memory_mb:<9.1f} "
f"{r.success}/{r.total_prompts}{marker}"
)
return "\n".join(lines)
def pick_winner(aggregated: list[ConfigResult]) -> ConfigResult:
"""Choose the winner: highest tokens/sec among successful configs."""
candidates = [r for r in aggregated if r.success > 0]
if not candidates:
return aggregated[0] if aggregated else ConfigResult(
config_name="none", backend="", model="", kv_type="",
total_prompts=0, success=0, failed=0,
avg_ttft_s=None, avg_tok_per_sec=0.0, avg_latency_s=0.0,
peak_memory_mb=0.0,
)
winner = max(candidates, key=lambda r: r.avg_tok_per_sec)
winner.winner = True
return winner
def run_comparison(configs: list[ConfigEntry], prompts: list[dict],
output_file: Optional[str] = None,
timeout: int = 120) -> dict:
"""Run full comparison and return structured report."""
all_results: list[ConfigResult] = []
for cfg in configs:
print(f"\n--- {cfg.name} ({cfg.backend}/{cfg.kv_type}) ---")
per_prompt = run_config(cfg, prompts, timeout)
peak_mb = get_peak_memory_mb()
agg = aggregate(per_prompt, cfg, peak_mb)
all_results.append(agg)
winner = pick_winner(all_results)
table = build_comparison_table(all_results)
report = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"prompts_count": len(prompts),
"winner": winner.config_name,
"winner_tok_per_sec": winner.avg_tok_per_sec,
"configs": [r.to_dict() for r in all_results],
"table": table,
}
print(f"\n{table}")
print(f"\nWinner: {winner.config_name} ({winner.avg_tok_per_sec:.2f} tok/s)")
if output_file:
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
with open(output_file, "w") as f:
json.dump(report, f, indent=2)
print(f"Report saved to {output_file}")
return report
# ---------------------------------------------------------------------------
# Demo mode (no live servers required)
# ---------------------------------------------------------------------------
def run_demo(output_file: Optional[str] = None) -> dict:
"""Generate synthetic benchmark results for testing."""
import random
random.seed(42)
# Simulated performance baselines
baselines = {
"ollama-gemma4": {"ttft": 0.85, "tps": 18.2, "mem": 2200},
"llama-f16": {"ttft": 0.72, "tps": 22.1, "mem": 2400},
"llama-turbo4": {"ttft": 0.68, "tps": 19.8, "mem": 850},
"llama-turbo4-adaptive": {"ttft": 0.65, "tps": 20.5, "mem": 820},
}
all_results: list[ConfigResult] = []
for cfg in DEFAULT_CONFIGS:
bl = baselines[cfg.name]
prompt_count = 10
ttft = bl["ttft"] + random.gauss(0, 0.02)
tps = bl["tps"] + random.gauss(0, 0.5)
lat = (ttft + 512 / tps) + random.gauss(0, 0.1)
agg = ConfigResult(
config_name=cfg.name,
backend=cfg.backend,
model=cfg.model,
kv_type=cfg.kv_type,
total_prompts=prompt_count,
success=prompt_count,
failed=0,
avg_ttft_s=round(ttft, 3),
avg_tok_per_sec=round(tps, 2),
avg_latency_s=round(lat, 3),
peak_memory_mb=bl["mem"] + random.gauss(0, 50),
)
all_results.append(agg)
winner = pick_winner(all_results)
table = build_comparison_table(all_results)
report = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"prompts_count": 10,
"mode": "demo",
"winner": winner.config_name,
"winner_tok_per_sec": winner.avg_tok_per_sec,
"configs": [r.to_dict() for r in all_results],
"table": table,
}
print(f"\n{table}")
print(f"\nWinner: {winner.config_name} ({winner.avg_tok_per_sec:.2f} tok/s)")
if output_file:
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
with open(output_file, "w") as f:
json.dump(report, f, indent=2)
print(f"Report saved to {output_file}")
return report
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(
description="TurboQuant multi-config benchmark comparison")
parser.add_argument("--config", type=str,
help="JSON file with custom configurations")
parser.add_argument("--prompts", type=str,
default="benchmarks/test_prompts.json",
help="Path to test prompts JSON")
parser.add_argument("--output", type=str, default=None,
help="Output file for JSON report")
parser.add_argument("--timeout", type=int, default=120,
help="Timeout per prompt in seconds")
parser.add_argument("--demo", action="store_true",
help="Run with synthetic data (no servers)")
args = parser.parse_args()
if args.demo:
run_demo(args.output)
return
# Load configs
if args.config:
with open(args.config) as f:
raw = json.load(f)
configs = [ConfigEntry(**c) for c in raw]
else:
configs = DEFAULT_CONFIGS
# Load prompts
prompts = load_prompts(args.prompts)
run_comparison(configs, prompts, args.output, args.timeout)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""Apple Silicon DFlash planning helpers and CLI (issue #152)."""
from __future__ import annotations
import argparse
import json
import platform
import subprocess
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable, Optional
@dataclass(frozen=True)
class DFlashPair:
slug: str
base_model: str
draft_model: str
estimated_total_weights_gb: float
minimum_recommended_memory_gb: float
draft_sliding_window_size: int = 4096
SUPPORTED_PAIRS: tuple[DFlashPair, ...] = (
DFlashPair(
slug="qwen35-4b",
base_model="Qwen/Qwen3.5-4B",
draft_model="z-lab/Qwen3.5-4B-DFlash",
estimated_total_weights_gb=9.68,
minimum_recommended_memory_gb=16.0,
),
DFlashPair(
slug="qwen35-9b",
base_model="Qwen/Qwen3.5-9B",
draft_model="z-lab/Qwen3.5-9B-DFlash",
estimated_total_weights_gb=19.93,
minimum_recommended_memory_gb=28.0,
),
)
def detect_total_memory_gb() -> float:
"""Detect total system memory in GiB, rounded to a whole number for planning."""
system = platform.system()
if system == "Darwin":
mem_bytes = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip())
return round(mem_bytes / (1024 ** 3), 1)
if system == "Linux":
with open("/proc/meminfo", "r", encoding="utf-8") as handle:
for line in handle:
if line.startswith("MemTotal:"):
mem_kb = int(line.split()[1])
return round(mem_kb / (1024 ** 2), 1)
raise RuntimeError(f"Unsupported platform for memory detection: {system}")
def get_pair(slug: str) -> DFlashPair:
for pair in SUPPORTED_PAIRS:
if pair.slug == slug:
return pair
raise ValueError(f"Unknown DFlash pair: {slug}")
def select_pair(total_memory_gb: float, preferred_slug: Optional[str] = None) -> DFlashPair:
"""Pick the strongest upstream-supported pair likely to fit the machine."""
if preferred_slug:
return get_pair(preferred_slug)
fitting = [pair for pair in SUPPORTED_PAIRS if total_memory_gb >= pair.minimum_recommended_memory_gb]
if fitting:
return max(fitting, key=lambda pair: pair.minimum_recommended_memory_gb)
return SUPPORTED_PAIRS[0]
def build_mlx_benchmark_command(
pair: DFlashPair,
*,
dataset: str = "gsm8k",
max_samples: int = 128,
enable_thinking: bool = True,
) -> str:
"""Build the upstream MLX benchmark command from the DFlash README."""
parts = [
"python -m dflash.benchmark --backend mlx",
f"--model {pair.base_model}",
f"--draft-model {pair.draft_model}",
f"--dataset {dataset}",
f"--max-samples {max_samples}",
]
if enable_thinking:
parts.append("--enable-thinking")
parts.append(f"--draft-sliding-window-size {pair.draft_sliding_window_size}")
return " \\\n ".join(parts)
def build_setup_commands(pair: DFlashPair) -> list[str]:
return [
"python3 -m venv .venv-dflash",
"source .venv-dflash/bin/activate",
"git clone https://github.com/z-lab/dflash.git",
"cd dflash",
"pip install -e .[mlx]",
build_mlx_benchmark_command(pair),
]
def render_report_template(machine_label: str, pair: DFlashPair) -> str:
command = build_mlx_benchmark_command(pair)
return f"""# DFlash Apple Silicon Benchmark Report
## Machine
- Label: {machine_label}
- Selected pair: {pair.slug}
- Base model: {pair.base_model}
- Draft model: {pair.draft_model}
- Estimated total weight footprint: {pair.estimated_total_weights_gb:.2f} GB
## Setup
```bash
python3 -m venv .venv-dflash
source .venv-dflash/bin/activate
git clone https://github.com/z-lab/dflash.git
cd dflash
pip install -e .[mlx]
{command}
```
## Baseline comparison
Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set.
## Results
- Throughput (tok/s):
- Peak memory (GB):
- Notes on acceptance / behavior:
## Verdict
Worth operationalizing locally?
- [ ] Yes
- [ ] No
- [ ] Needs more data
## Recommendation
Explain whether this should become part of the local inference stack.
"""
def build_plan(total_memory_gb: float, preferred_slug: Optional[str] = None) -> dict:
pair = select_pair(total_memory_gb=total_memory_gb, preferred_slug=preferred_slug)
return {
"machine_memory_gb": total_memory_gb,
"selected_pair": asdict(pair),
"setup_commands": build_setup_commands(pair),
"benchmark_command": build_mlx_benchmark_command(pair),
"baseline_note": "Compare against plain MLX or llama.cpp speculative decoding on the same prompt set.",
}
def write_output(path: Path, content: str) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(content, encoding="utf-8")
def main(argv: Optional[Iterable[str]] = None) -> int:
parser = argparse.ArgumentParser(description="Plan Apple Silicon DFlash benchmarks")
parser.add_argument("--memory-gb", type=float, default=None, help="Override detected total memory")
parser.add_argument("--pair", choices=[pair.slug for pair in SUPPORTED_PAIRS], default=None)
parser.add_argument("--machine-label", default="Apple Silicon Mac")
parser.add_argument("--format", choices=["json", "markdown"], default="markdown")
parser.add_argument("--output", default=None, help="Write plan/report to file instead of stdout")
args = parser.parse_args(list(argv) if argv is not None else None)
memory_gb = args.memory_gb if args.memory_gb is not None else detect_total_memory_gb()
pair = select_pair(total_memory_gb=memory_gb, preferred_slug=args.pair)
if args.format == "json":
content = json.dumps(build_plan(memory_gb, preferred_slug=pair.slug), indent=2)
else:
content = render_report_template(args.machine_label, pair)
if args.output:
write_output(Path(args.output), content)
else:
print(content)
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,41 @@
# DFlash Apple Silicon Benchmark Report
## Machine
- Label: M3 Max 36GB
- Selected pair: qwen35-9b
- Base model: Qwen/Qwen3.5-9B
- Draft model: z-lab/Qwen3.5-9B-DFlash
- Estimated total weight footprint: 19.93 GB
## Setup
```bash
python3 -m venv .venv-dflash
source .venv-dflash/bin/activate
git clone https://github.com/z-lab/dflash.git
cd dflash
pip install -e .[mlx]
python -m dflash.benchmark --backend mlx \
--model Qwen/Qwen3.5-9B \
--draft-model z-lab/Qwen3.5-9B-DFlash \
--dataset gsm8k \
--max-samples 128 \
--enable-thinking \
--draft-sliding-window-size 4096
```
## Baseline comparison
Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set.
## Results
- Throughput (tok/s):
- Peak memory (GB):
- Notes on acceptance / behavior:
## Verdict
Worth operationalizing locally?
- [ ] Yes
- [ ] No
- [ ] Needs more data
## Recommendation
Explain whether this should become part of the local inference stack.

View File

@@ -0,0 +1,46 @@
# DFlash Apple Silicon Pilot — Qwen3.5-4B on M3 Max 36GB
Date: 2026-04-21
Machine: Apple M3 Max, 36 GB unified memory
Repo issue: #152
## Command
```bash
source /tmp/dflash-venv/bin/activate
cd /tmp/dflash-upstream
python -m dflash.benchmark --backend mlx \
--model Qwen/Qwen3.5-4B \
--draft-model z-lab/Qwen3.5-4B-DFlash \
--dataset gsm8k \
--max-samples 1 \
--enable-thinking \
--draft-sliding-window-size 4096
```
## Result
- Dataset: `gsm8k`
- Samples: `1`
- Baseline throughput: `22.35 tok/s`
- DFlash throughput: `46.78 tok/s`
- Decoding speedup: `2.09x`
- Average acceptance length: `6.48`
Acceptance length histogram:
```text
['0.3%', '11.1%', '12.7%', '10.4%', '11.7%', '7.6%', '7.0%', '3.8%', '5.1%', '6.3%', '2.8%', '3.8%', '2.2%', '1.9%', '0.9%', '2.5%', '9.8%']
```
## Caveats
- This is a **pilot**, not a decision-grade benchmark.
- Only `1` sample was run, so the throughput number is directional.
- No apples-to-apples baseline against plain MLX or llama.cpp speculative decoding is included yet.
- The planner still recommends trying `Qwen/Qwen3.5-9B + z-lab/Qwen3.5-9B-DFlash` on this machine for the more meaningful fit test.
## Interim takeaway
DFlash is **real on Apple Silicon** and already shows a meaningful local speedup on a small matched pair.
A `2.09x` pilot speedup on `Qwen3.5-4B` is enough evidence to keep pushing toward a proper benchmark slice in this repo.

View File

@@ -0,0 +1,59 @@
# DFlash on Apple Silicon Failure Report — Qwen3.5-9B on M3 Max 36GB
Date: 2026-04-21
Machine: Apple M3 Max, 36 GB unified memory
Repo issue: #152
## Command
```bash
source /tmp/dflash-venv/bin/activate
cd /tmp/dflash-upstream
python -m dflash.benchmark --backend mlx \
--model Qwen/Qwen3.5-9B \
--draft-model z-lab/Qwen3.5-9B-DFlash \
--dataset gsm8k \
--max-samples 1 \
--enable-thinking \
--draft-sliding-window-size 4096
```
## Outcome
The benchmark did **not** complete successfully on this machine.
### Failure signature
```text
libc++abi: terminating due to uncaught exception of type std::runtime_error:
[METAL] Command buffer execution failed:
Caused GPU Timeout Error (00000002:kIOGPUCommandBufferCallbackErrorTimeout)
```
Additional shutdown noise:
```text
bash: [11285: 1] tcsetattr: Inappropriate ioctl for device
resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
```
## Interpretation
This is strong evidence that the `Qwen/Qwen3.5-9B + z-lab/Qwen3.5-9B-DFlash` pair is **not currently stable** on an M3 Max 36GB Mac under the upstream MLX benchmark path, at least with the default settings used here.
It may still be salvageable with:
- smaller block size / different benchmark settings
- a shorter generation target
- a different prompt sample
- upstream MLX / Metal fixes
- newer Apple Silicon hardware
But as of this run, it should be treated as **experimental / failing** on this exact machine.
## Recommendation
For this Mac, the working local proof path is still:
- `Qwen/Qwen3.5-4B`
- `z-lab/Qwen3.5-4B-DFlash`
Use the 4B pair for reproducible local validation while the 9B Metal timeout is investigated separately.

View File

@@ -1,124 +0,0 @@
#!/usr/bin/env python3
"""Check local markdown links.
Scans markdown files for local links and fails on broken targets.
Ignores:
- external URLs (http/https)
- anchors (#section)
- mailto: and tel:
- links inside fenced code blocks
- generated/build directories
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
from typing import Iterable
CODE_FENCE_RE = re.compile(r"^```")
LINK_RE = re.compile(r"(?<!!)\[[^\]]+\]\(([^)]+)\)")
DEFAULT_SKIP_DIRS = {
".git",
".gitea",
".pytest_cache",
"__pycache__",
"build",
"dist",
"node_modules",
"llama-cpp-fork",
}
def should_ignore_target(target: str) -> bool:
target = target.strip()
return (
not target
or target.startswith("http://")
or target.startswith("https://")
or target.startswith("mailto:")
or target.startswith("tel:")
or target.startswith("#")
)
def normalize_target(target: str) -> str:
target = target.strip()
if target.startswith("<") and target.endswith(">"):
target = target[1:-1].strip()
if "#" in target:
target = target.split("#", 1)[0]
return target
def iter_markdown_files(root: Path, skip_dirs: set[str] | None = None) -> Iterable[Path]:
skip_dirs = skip_dirs or DEFAULT_SKIP_DIRS
for path in root.rglob("*.md"):
if any(part in skip_dirs for part in path.relative_to(root).parts):
continue
yield path
def iter_links(path: Path) -> Iterable[tuple[int, str]]:
in_code_fence = False
for line_no, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
if CODE_FENCE_RE.match(line.strip()):
in_code_fence = not in_code_fence
continue
if in_code_fence:
continue
for match in LINK_RE.finditer(line):
yield line_no, match.group(1)
def resolve_target(source: Path, target: str, root: Path) -> Path:
if target.startswith("/"):
return (root / target.lstrip("/")).resolve()
return (source.parent / target).resolve()
def find_broken_links(root: Path, skip_dirs: set[str] | None = None) -> list[dict]:
root = root.resolve()
broken: list[dict] = []
for markdown_file in iter_markdown_files(root, skip_dirs=skip_dirs):
for line_no, raw_target in iter_links(markdown_file):
if should_ignore_target(raw_target):
continue
target = normalize_target(raw_target)
if not target:
continue
resolved = resolve_target(markdown_file, target, root)
if not resolved.exists():
broken.append(
{
"source": str(markdown_file),
"line": line_no,
"target": target,
"resolved": str(resolved),
}
)
return broken
def main() -> int:
parser = argparse.ArgumentParser(description="Fail on broken local markdown links.")
parser.add_argument("root", nargs="?", default=".", help="Repo root to scan (default: .)")
args = parser.parse_args()
root = Path(args.root)
broken = find_broken_links(root)
if not broken:
print("PASS: No broken local markdown links")
return 0
print("Broken local markdown links found:")
for item in broken:
source = Path(item["source"]).relative_to(root.resolve())
print(f"{source}:{item['line']}: missing target -> {item['target']}")
return 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,125 @@
# DFlash on Apple Silicon
This repo now carries a **Gitea-first benchmark harness** for evaluating whether upstream **DFlash on MLX** is worth adding to the local Apple Silicon inference stack.
## Why
The headline `Kimi K2.6 + DFlash` benchmark was measured on `8x MI300X` with huge RAM and ROCm patches. That exact recipe is not a fit for a `36 GB` Apple Silicon Mac.
What *is* relevant locally is the upstream `z-lab/dflash` MLX path, which can benchmark smaller matched target/draft pairs that fit on Apple Silicon.
## Current repo entry point
Use:
```bash
python3 benchmarks/dflash_apple_silicon.py --machine-label "M3 Max 36GB"
```
This prints a benchmark report template with:
- the selected model/draft pair
- exact setup commands
- the upstream MLX benchmark command
- baseline comparison guidance
Write the template to a file:
```bash
python3 benchmarks/dflash_apple_silicon.py \
--machine-label "M3 Max 36GB" \
--output benchmarks/reports/dflash_m3max_36gb.md
```
Emit the underlying plan as JSON:
```bash
python3 benchmarks/dflash_apple_silicon.py --format json
```
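For orientation, the JSON plan mirrors the fields returned by the planner's `build_plan`; an abridged sketch of its shape on a 36 GB machine (list and command values elided here):

```json
{
  "machine_memory_gb": 36.0,
  "selected_pair": {
    "slug": "qwen35-9b",
    "base_model": "Qwen/Qwen3.5-9B",
    "draft_model": "z-lab/Qwen3.5-9B-DFlash",
    "estimated_total_weights_gb": 19.93,
    "minimum_recommended_memory_gb": 28.0,
    "draft_sliding_window_size": 4096
  },
  "setup_commands": ["python3 -m venv .venv-dflash", "..."],
  "benchmark_command": "python -m dflash.benchmark --backend mlx ...",
  "baseline_note": "Compare against plain MLX or llama.cpp speculative decoding on the same prompt set."
}
```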
## Selection logic
Today the planner uses two upstream-supported MLX pairs:
- `qwen35-9b`
- base: `Qwen/Qwen3.5-9B`
- draft: `z-lab/Qwen3.5-9B-DFlash`
- chosen for ~28 GB+ machines
- `qwen35-4b`
- base: `Qwen/Qwen3.5-4B`
- draft: `z-lab/Qwen3.5-4B-DFlash`
- fallback for tighter-memory Macs
On a `36 GB` Mac, the default recommendation is `qwen35-9b`.
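The same memory thresholds are exposed programmatically. A minimal sketch of how `select_pair` behaves, assuming the repo root is on `PYTHONPATH` (the planner tests insert it via `sys.path`):

```python
from benchmarks.dflash_apple_silicon import select_pair

# 36 GB clears the ~28 GB floor for the 9B pair, so it is preferred.
assert select_pair(total_memory_gb=36).slug == "qwen35-9b"

# Below that floor the planner falls back to the 4B pair.
assert select_pair(total_memory_gb=20).slug == "qwen35-4b"

# An explicit preference bypasses the memory heuristic entirely.
assert select_pair(total_memory_gb=36, preferred_slug="qwen35-4b").slug == "qwen35-4b"
```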
## Pilot result already landed
A first live Apple Silicon run has already been captured in:
- `benchmarks/reports/dflash_m3max_36gb_qwen35_4b_pilot.md`
Pilot command:
```bash
python -m dflash.benchmark --backend mlx \
--model Qwen/Qwen3.5-4B \
--draft-model z-lab/Qwen3.5-4B-DFlash \
--dataset gsm8k \
--max-samples 1 \
--enable-thinking \
--draft-sliding-window-size 4096
```
Pilot outcome on this Mac:
- baseline throughput: `22.35 tok/s`
- DFlash throughput: `46.78 tok/s`
- decoding speedup: `2.09x`
Treat that as a **directional proof**, not a final decision benchmark. The next step is the fuller comparison slice against plain MLX or llama.cpp speculative decoding.
## Known 9B failure on this machine
A follow-up live run with:
- `Qwen/Qwen3.5-9B`
- `z-lab/Qwen3.5-9B-DFlash`
failed on this same M3 Max 36GB Mac with:
```text
[METAL] Command buffer execution failed:
Caused GPU Timeout Error (00000002:kIOGPUCommandBufferCallbackErrorTimeout)
```
That failure is recorded in:
- `benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md`
So the current guidance is:
- treat `qwen35-9b` as **experimental** on this machine
- treat `qwen35-4b` as the current **known-working local proof path**
- keep the issue open until we either stabilize the 9B path or clearly rule it out for this hardware tier
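One way to follow that guidance is to generate the report skeleton for the known-working pair explicitly, using the planner's `--pair` override (the output filename here is illustrative):

```bash
python3 benchmarks/dflash_apple_silicon.py \
  --machine-label "M3 Max 36GB" \
  --pair qwen35-4b \
  --output benchmarks/reports/dflash_m3max_36gb_qwen35_4b.md
```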
## Upstream benchmark command
The harness uses the upstream MLX benchmark syntax from `z-lab/dflash`:
```bash
python -m dflash.benchmark --backend mlx \
--model Qwen/Qwen3.5-9B \
--draft-model z-lab/Qwen3.5-9B-DFlash \
--dataset gsm8k \
--max-samples 128 \
--enable-thinking \
--draft-sliding-window-size 4096
```
## What remains
This PR adds the **planner, report template, and first pilot/timeout reports** so the benchmark is reproducible from the repo.
The issue remains open until a decision-grade Apple Silicon run lands with:
- measured throughput
- measured memory
- a baseline comparison against plain MLX or llama.cpp speculative decoding
- a recommendation on whether to operationalize DFlash locally

View File

@@ -385,7 +385,7 @@ Step 7: If pass → production. If fail → drop to turbo3 or adjust per-layer p
---
*Repo: https://forge.alexanderwhitestone.com/Timmy_Foundation/turboquant*
*Repo: http://143.198.27.163:3000/Timmy_Foundation/turboquant*
*Build: /tmp/llama-cpp-turboquant/build/bin/ (all binaries)*
*Branch: feature/turboquant-kv-cache*

View File

@@ -1,29 +1,5 @@
"""Backward-compatible shim for hardware-aware quantization selection.
The original Phase 19 placeholder `hardware_optimizer.py` never shipped real
logic. The canonical implementation now lives in `evolution.quant_selector`.
This shim preserves the legacy import path for any downstream callers while
making `quant_selector.py` the single source of truth.
"""Phase 19: Hardware-Aware Inference Optimization.
Part of the TurboQuant suite for local inference excellence.
"""
from evolution.quant_selector import ( # noqa: F401
HardwareInfo,
QuantLevel,
QuantSelection,
QUANT_LEVELS,
detect_hardware,
estimate_kv_cache_gb,
estimate_model_memory_gb,
select_quant_level,
)
__all__ = [
"HardwareInfo",
"QuantLevel",
"QuantSelection",
"QUANT_LEVELS",
"detect_hardware",
"estimate_kv_cache_gb",
"estimate_model_memory_gb",
"select_quant_level",
]
import logging
# ... (rest of the code)

View File

@@ -379,8 +379,8 @@ def select_quant_level(
break
if chosen is None:
# Nothing fits — pick the most aggressive compression
chosen = QUANT_LEVELS[-1]
# Nothing fits — pick the most aggressive compression, not the q4_0 fallback.
chosen = max(QUANT_LEVELS, key=lambda level: level.compression_ratio)
logger.warning(f"No quant level fits in {memory_pool_gb:.1f}GB. Using {chosen.name}.")
# Calculate final numbers

View File

@@ -1,164 +0,0 @@
#!/usr/bin/env python3
"""
Tests for benchmark comparison module (Issue #29).
Covers: ConfigEntry, ConfigResult, aggregation, comparison table,
demo mode, and config loading.
"""
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "benchmarks"))
from compare_configs import (
ConfigEntry,
ConfigResult,
DEFAULT_CONFIGS,
aggregate,
build_comparison_table,
load_prompts,
pick_winner,
run_demo,
)
class TestConfigEntry(unittest.TestCase):
def test_default_values(self):
c = ConfigEntry(name="test", backend="ollama", model="gemma4", url="http://x")
self.assertEqual(c.kv_type, "f16")
self.assertFalse(c.layer_adaptive)
def test_to_dict(self):
c = ConfigEntry(name="test", backend="llama-server", model="g", url="http://x",
kv_type="turbo4", layer_adaptive=True)
d = c.to_dict()
self.assertEqual(d["kv_type"], "turbo4")
self.assertTrue(d["layer_adaptive"])
class TestDefaultConfigs(unittest.TestCase):
def test_four_configs(self):
self.assertEqual(len(DEFAULT_CONFIGS), 4)
def test_names(self):
names = [c.name for c in DEFAULT_CONFIGS]
self.assertIn("ollama-gemma4", names)
self.assertIn("llama-f16", names)
self.assertIn("llama-turbo4", names)
self.assertIn("llama-turbo4-adaptive", names)
def test_turbo4_adaptive_has_flag(self):
cfg = next(c for c in DEFAULT_CONFIGS if c.name == "llama-turbo4-adaptive")
self.assertTrue(cfg.layer_adaptive)
self.assertEqual(cfg.kv_type, "turbo4")
class TestAggregate(unittest.TestCase):
def _make_results(self, n_success: int, n_fail: int) -> list[dict]:
results = []
for i in range(n_success):
results.append({
"status": "success",
"ttft_s": 0.5 + i * 0.1,
"tokens_per_sec": 20.0 + i * 0.5,
"latency_s": 1.0 + i * 0.05,
})
for _ in range(n_fail):
results.append({"status": "failed", "latency_s": 0.5})
return results
def test_basic_aggregate(self):
results = self._make_results(5, 1)
cfg = ConfigEntry(name="test", backend="ollama", model="m", url="http://x")
agg = aggregate(results, cfg, peak_mb=100.0)
self.assertEqual(agg.success, 5)
self.assertEqual(agg.failed, 1)
self.assertEqual(agg.total_prompts, 6)
self.assertAlmostEqual(agg.peak_memory_mb, 100.0)
self.assertGreater(agg.avg_tok_per_sec, 0)
def test_no_success(self):
results = [{"status": "failed", "latency_s": 0.1}]
cfg = ConfigEntry(name="test", backend="ollama", model="m", url="http://x")
agg = aggregate(results, cfg, peak_mb=0.0)
self.assertEqual(agg.avg_tok_per_sec, 0.0)
self.assertIsNone(agg.avg_ttft_s)
class TestPickWinner(unittest.TestCase):
def test_highest_tps_wins(self):
configs = [
ConfigResult(config_name="slow", backend="o", model="m", kv_type="f",
total_prompts=5, success=5, failed=0, avg_ttft_s=1.0,
avg_tok_per_sec=10.0, avg_latency_s=2.0, peak_memory_mb=100),
ConfigResult(config_name="fast", backend="o", model="m", kv_type="f",
total_prompts=5, success=5, failed=0, avg_ttft_s=0.5,
avg_tok_per_sec=25.0, avg_latency_s=1.5, peak_memory_mb=100),
]
w = pick_winner(configs)
self.assertEqual(w.config_name, "fast")
self.assertTrue(w.winner)
def test_no_success_returns_first(self):
configs = [
ConfigResult(config_name="dead", backend="o", model="m", kv_type="f",
total_prompts=5, success=0, failed=5, avg_ttft_s=None,
avg_tok_per_sec=0.0, avg_latency_s=0.0, peak_memory_mb=0),
]
w = pick_winner(configs)
self.assertEqual(w.config_name, "dead")
class TestComparisonTable(unittest.TestCase):
def test_table_has_headers(self):
configs = [
ConfigResult(config_name="test-cfg", backend="o", model="m", kv_type="f",
total_prompts=5, success=5, failed=0, avg_ttft_s=0.5,
avg_tok_per_sec=20.0, avg_latency_s=1.5, peak_memory_mb=100),
]
w = pick_winner(configs)
table = build_comparison_table(configs)
self.assertIn("Config", table)
self.assertIn("tok/s", table)
self.assertIn("WINNER", table)
class TestDemoMode(unittest.TestCase):
def test_demo_produces_report(self):
with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
out_path = Path(f.name)
try:
report = run_demo(str(out_path))
self.assertEqual(report["mode"], "demo")
self.assertEqual(report["prompts_count"], 10)
self.assertEqual(len(report["configs"]), 4)
self.assertTrue(out_path.exists())
saved = json.loads(out_path.read_text())
self.assertIn("winner", saved)
finally:
out_path.unlink(missing_ok=True)
def test_demo_without_output(self):
report = run_demo()
self.assertIn("winner", report)
self.assertGreater(report["winner_tok_per_sec"], 0)
class TestLoadPrompts(unittest.TestCase):
def test_load_test_prompts(self):
prompts_file = Path(__file__).resolve().parent.parent / "benchmarks" / "test_prompts.json"
if prompts_file.exists():
prompts = load_prompts(str(prompts_file))
self.assertGreater(len(prompts), 0)
for p in prompts:
self.assertIn("prompt", p)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python3
"""Tests for Apple Silicon DFlash benchmark planning helpers (issue #152)."""
import os
import sys
from unittest.mock import patch
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from benchmarks.dflash_apple_silicon import ( # noqa: E402
build_mlx_benchmark_command,
detect_total_memory_gb,
render_report_template,
select_pair,
)
class TestPairSelection:
def test_prefers_qwen35_9b_on_36gb_mac(self):
pair = select_pair(total_memory_gb=36)
assert pair.slug == "qwen35-9b"
assert pair.base_model == "Qwen/Qwen3.5-9B"
assert pair.draft_model == "z-lab/Qwen3.5-9B-DFlash"
def test_falls_back_to_4b_when_memory_is_tight(self):
pair = select_pair(total_memory_gb=20)
assert pair.slug == "qwen35-4b"
assert pair.base_model == "Qwen/Qwen3.5-4B"
class TestCommandGeneration:
def test_builds_upstream_mlx_benchmark_command(self):
pair = select_pair(total_memory_gb=36)
command = build_mlx_benchmark_command(pair, dataset="gsm8k", max_samples=64)
assert "python -m dflash.benchmark --backend mlx" in command
assert "--model Qwen/Qwen3.5-9B" in command
assert "--draft-model z-lab/Qwen3.5-9B-DFlash" in command
assert "--dataset gsm8k" in command
assert "--max-samples 64" in command
assert "--draft-sliding-window-size 4096" in command
class TestReportTemplate:
def test_report_template_mentions_baseline_and_verdict(self):
pair = select_pair(total_memory_gb=36)
report = render_report_template(machine_label="M3 Max 36GB", pair=pair)
assert "DFlash Apple Silicon Benchmark Report" in report
assert "M3 Max 36GB" in report
assert "Qwen/Qwen3.5-9B" in report
assert "plain MLX or llama.cpp speculative decoding" in report
assert "Worth operationalizing locally?" in report
class TestMemoryDetection:
@patch("benchmarks.dflash_apple_silicon.platform.system", return_value="Darwin")
@patch("benchmarks.dflash_apple_silicon.subprocess.check_output", return_value=b"38654705664\n")
def test_detect_total_memory_gb_on_macos(self, _mock_sysctl, _mock_system):
assert detect_total_memory_gb() == 36.0

View File

@@ -1,21 +0,0 @@
#!/usr/bin/env python3
"""Tests for hardware_optimizer compatibility shim."""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from evolution import hardware_optimizer, quant_selector
def test_hardware_optimizer_reexports_quant_selector_api():
assert hardware_optimizer.select_quant_level is quant_selector.select_quant_level
assert hardware_optimizer.detect_hardware is quant_selector.detect_hardware
assert hardware_optimizer.HardwareInfo is quant_selector.HardwareInfo
assert hardware_optimizer.QuantSelection is quant_selector.QuantSelection
def test_hardware_optimizer_exports_quant_level_definitions():
assert hardware_optimizer.QUANT_LEVELS is quant_selector.QUANT_LEVELS
assert hardware_optimizer.QuantLevel is quant_selector.QuantLevel

View File

@@ -1,74 +0,0 @@
import textwrap
from pathlib import Path
from check_markdown_links import find_broken_links
def write(path: Path, content: str) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(textwrap.dedent(content).lstrip(), encoding="utf-8")
def test_reports_missing_local_markdown_target_with_line_number(tmp_path: Path):
write(
tmp_path / "README.md",
"""
# Repo
See [status](docs/status.md).
""",
)
broken = find_broken_links(tmp_path)
assert len(broken) == 1
assert broken[0]["source"].endswith("README.md")
assert broken[0]["line"] == 3
assert broken[0]["target"] == "docs/status.md"
def test_allows_existing_relative_targets(tmp_path: Path):
write(tmp_path / "docs" / "status.md", "# Status\n")
write(
tmp_path / "README.md",
"""
# Repo
See [status](docs/status.md).
""",
)
assert find_broken_links(tmp_path) == []
def test_ignores_external_anchor_mailto_and_tel_links(tmp_path: Path):
write(
tmp_path / "README.md",
"""
[external](https://example.com)
[anchor](#section)
[mail](mailto:test@example.com)
[call](tel:988)
""",
)
assert find_broken_links(tmp_path) == []
def test_ignores_links_inside_fenced_code_blocks(tmp_path: Path):
write(
tmp_path / "README.md",
"""
```md
[broken](docs/missing.md)
```
""",
)
assert find_broken_links(tmp_path) == []
def test_skips_build_directories(tmp_path: Path):
write(tmp_path / "build" / "README.md", "[broken](missing.md)\n")
assert find_broken_links(tmp_path) == []

View File

@@ -19,36 +19,11 @@ from evolution.quant_selector import (
class TestQuantLevels:
def test_levels_ordered_by_quality(self):
"""TurboQuant levels should be ordered from best quality to most aggressive.
The quality ordering invariant for TurboQuant levels is monotonically
increasing compression_ratio (more aggressive = more compression).
Non-TurboQuant fallbacks (e.g. q4_0) are placed after all TurboQuant
levels and may have any compression ratio — they exist as safe defaults,
not as part of the quality progression.
"""
turbo_quant_names = {"turbo4", "turbo3", "turbo2"}
turbo_levels = [l for l in QUANT_LEVELS if l.name in turbo_quant_names]
for i in range(len(turbo_levels) - 1):
assert turbo_levels[i].compression_ratio <= turbo_levels[i + 1].compression_ratio, (
f"TurboQuant {turbo_levels[i].name} (compression={turbo_levels[i].compression_ratio}x) "
f"should have <= compression than {turbo_levels[i+1].name} "
f"(compression={turbo_levels[i+1].compression_ratio}x)"
)
def test_fallback_quant_is_last(self):
"""Non-TurboQuant fallbacks (e.g. q4_0) should be at the end of the list."""
turbo_quant_names = {"turbo4", "turbo3", "turbo2"}
found_fallback = False
for level in QUANT_LEVELS:
if level.name not in turbo_quant_names:
found_fallback = True
elif found_fallback:
pytest.fail(
f"TurboQuant level '{level.name}' appears after a fallback level. "
f"All TurboQuant levels must precede fallbacks."
)
def test_levels_keep_turboquant_quality_order_with_q4_fallback_last(self):
"""TurboQuant levels should lead, with q4_0 reserved as the non-Turbo fallback."""
names = [level.name for level in QUANT_LEVELS]
assert names[:3] == ["turbo4", "turbo3", "turbo2"]
assert names[-1] == "q4_0"
def test_all_levels_have_required_fields(self):
for level in QUANT_LEVELS:
@@ -174,6 +149,19 @@ class TestSelection:
sel = select_quant_level(model_size_gb=16.0, context_length=65536)
assert len(sel.warnings) > 0
def test_falls_back_to_turbo2_when_nothing_fits(self):
with patch("evolution.quant_selector.detect_hardware") as mock_hw:
mock_hw.return_value = HardwareInfo(
total_memory_gb=8,
available_memory_gb=6,
gpu_memory_gb=8,
gpu_name="Tiny GPU",
cpu_cores=4,
detection_method="mock",
)
sel = select_quant_level(model_size_gb=16.0, context_length=131072)
assert sel.level.name == "turbo2"
def test_reasoning_contains_key_info(self):
with patch("evolution.quant_selector.detect_hardware") as mock_hw:
mock_hw.return_value = HardwareInfo(

View File

@@ -1,83 +0,0 @@
"""Tests for smoke workflow CI configuration.
Validates that the GitHub Actions / Gitea Actions smoke workflow
actually runs the standalone CMake build and test suite, not just
parse checks.
"""
from pathlib import Path
import yaml
import pytest
WORKFLOW_PATH = Path(".gitea/workflows/smoke.yml")
@pytest.fixture
def workflow():
"""Load and parse the smoke workflow YAML."""
content = WORKFLOW_PATH.read_text(encoding="utf-8")
return yaml.safe_load(content)
def test_smoke_workflow_exists():
"""Smoke workflow file must exist."""
assert WORKFLOW_PATH.exists(), f"Missing {WORKFLOW_PATH}"
def test_smoke_has_cmake_configure_step(workflow):
"""Smoke workflow must configure the CMake project with tests enabled."""
steps = workflow["jobs"]["smoke"]["steps"]
cmake_found = False
for step in steps:
run = step.get("run", "")
if "cmake -S . -B build" in run and "TURBOQUANT_BUILD_TESTS=ON" in run:
cmake_found = True
break
assert cmake_found, (
"Smoke workflow missing cmake configure step with TURBOQUANT_BUILD_TESTS=ON"
)
def test_smoke_has_cmake_build_step(workflow):
"""Smoke workflow must build the CMake project."""
steps = workflow["jobs"]["smoke"]["steps"]
build_found = False
for step in steps:
run = step.get("run", "")
if "cmake --build build" in run:
build_found = True
break
assert build_found, "Smoke workflow missing cmake --build step"
def test_smoke_has_ctest_step(workflow):
"""Smoke workflow must run ctest."""
steps = workflow["jobs"]["smoke"]["steps"]
ctest_found = False
for step in steps:
run = step.get("run", "")
if "ctest" in run and "output-on-failure" in run:
ctest_found = True
break
assert ctest_found, "Smoke workflow missing ctest --output-on-failure step"
def test_smoke_build_before_secret_scan(workflow):
"""Build and test steps must run before secret scan (fail fast on build errors)."""
steps = workflow["jobs"]["smoke"]["steps"]
names = [s.get("name", "") for s in steps]
build_idx = None
scan_idx = None
for i, name in enumerate(names):
if "cmake" in name.lower() or "build" in name.lower():
if build_idx is None:
build_idx = i
if "secret" in name.lower():
scan_idx = i
if build_idx is not None and scan_idx is not None:
assert build_idx < scan_idx, (
"Build step should run before secret scan to fail fast on broken code"
)