Compare commits
9 Commits
burn/64-17...feat/152-d

| Author | SHA1 | Date |
|---|---|---|
| | dabb96d315 | |
| | 69cef8a90f | |
| | 636d294896 | |
| | 492c1cdcfd | |
| | 6e583310a8 | |
| | 300918ee1e | |
| | f7ea01cb65 | |
| | d2edbdadc2 | |
| | c009d8df77 | |
@@ -30,3 +30,4 @@ See [issues](https://forge.alexanderwhitestone.com/Timmy_Foundation/turboquant/i

## Docs

- [Project Status](docs/PROJECT_STATUS.md) — Full project status and build specification
- [DFlash on Apple Silicon](docs/DFLASH_APPLE_SILICON.md) — MLX benchmark planner, setup commands, and report workflow
189 benchmarks/dflash_apple_silicon.py (Normal file)
@@ -0,0 +1,189 @@
#!/usr/bin/env python3
"""Apple Silicon DFlash planning helpers and CLI (issue #152)."""

from __future__ import annotations

import argparse
import json
import platform
import subprocess
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterable, Optional


@dataclass(frozen=True)
class DFlashPair:
    slug: str
    base_model: str
    draft_model: str
    estimated_total_weights_gb: float
    minimum_recommended_memory_gb: float
    draft_sliding_window_size: int = 4096


SUPPORTED_PAIRS: tuple[DFlashPair, ...] = (
    DFlashPair(
        slug="qwen35-4b",
        base_model="Qwen/Qwen3.5-4B",
        draft_model="z-lab/Qwen3.5-4B-DFlash",
        estimated_total_weights_gb=9.68,
        minimum_recommended_memory_gb=16.0,
    ),
    DFlashPair(
        slug="qwen35-9b",
        base_model="Qwen/Qwen3.5-9B",
        draft_model="z-lab/Qwen3.5-9B-DFlash",
        estimated_total_weights_gb=19.93,
        minimum_recommended_memory_gb=28.0,
    ),
)


def detect_total_memory_gb() -> float:
    """Detect total system memory in GiB, rounded to one decimal place for planning."""
    system = platform.system()
    if system == "Darwin":
        mem_bytes = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip())
        return round(mem_bytes / (1024 ** 3), 1)
    if system == "Linux":
        with open("/proc/meminfo", "r", encoding="utf-8") as handle:
            for line in handle:
                if line.startswith("MemTotal:"):
                    mem_kb = int(line.split()[1])
                    return round(mem_kb / (1024 ** 2), 1)
    raise RuntimeError(f"Unsupported platform for memory detection: {system}")


def get_pair(slug: str) -> DFlashPair:
    for pair in SUPPORTED_PAIRS:
        if pair.slug == slug:
            return pair
    raise ValueError(f"Unknown DFlash pair: {slug}")


def select_pair(total_memory_gb: float, preferred_slug: Optional[str] = None) -> DFlashPair:
    """Pick the strongest upstream-supported pair likely to fit the machine."""
    if preferred_slug:
        return get_pair(preferred_slug)

    fitting = [pair for pair in SUPPORTED_PAIRS if total_memory_gb >= pair.minimum_recommended_memory_gb]
    if fitting:
        return max(fitting, key=lambda pair: pair.minimum_recommended_memory_gb)
    return SUPPORTED_PAIRS[0]


def build_mlx_benchmark_command(
    pair: DFlashPair,
    *,
    dataset: str = "gsm8k",
    max_samples: int = 128,
    enable_thinking: bool = True,
) -> str:
    """Build the upstream MLX benchmark command from the DFlash README."""
    parts = [
        "python -m dflash.benchmark --backend mlx",
        f"--model {pair.base_model}",
        f"--draft-model {pair.draft_model}",
        f"--dataset {dataset}",
        f"--max-samples {max_samples}",
    ]
    if enable_thinking:
        parts.append("--enable-thinking")
    parts.append(f"--draft-sliding-window-size {pair.draft_sliding_window_size}")
    return " \\\n ".join(parts)


def build_setup_commands(pair: DFlashPair) -> list[str]:
    return [
        "python3 -m venv .venv-dflash",
        "source .venv-dflash/bin/activate",
        "git clone https://github.com/z-lab/dflash.git",
        "cd dflash",
        "pip install -e .[mlx]",
        build_mlx_benchmark_command(pair),
    ]


def render_report_template(machine_label: str, pair: DFlashPair) -> str:
    command = build_mlx_benchmark_command(pair)
    return f"""# DFlash Apple Silicon Benchmark Report

## Machine
- Label: {machine_label}
- Selected pair: {pair.slug}
- Base model: {pair.base_model}
- Draft model: {pair.draft_model}
- Estimated total weight footprint: {pair.estimated_total_weights_gb:.2f} GB

## Setup
```bash
python3 -m venv .venv-dflash
source .venv-dflash/bin/activate
git clone https://github.com/z-lab/dflash.git
cd dflash
pip install -e .[mlx]
{command}
```

## Baseline comparison
Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set.

## Results
- Throughput (tok/s):
- Peak memory (GB):
- Notes on acceptance / behavior:

## Verdict
Worth operationalizing locally?
- [ ] Yes
- [ ] No
- [ ] Needs more data

## Recommendation
Explain whether this should become part of the local inference stack.
"""


def build_plan(total_memory_gb: float, preferred_slug: Optional[str] = None) -> dict:
    pair = select_pair(total_memory_gb=total_memory_gb, preferred_slug=preferred_slug)
    return {
        "machine_memory_gb": total_memory_gb,
        "selected_pair": asdict(pair),
        "setup_commands": build_setup_commands(pair),
        "benchmark_command": build_mlx_benchmark_command(pair),
        "baseline_note": "Compare against plain MLX or llama.cpp speculative decoding on the same prompt set.",
    }


def write_output(path: Path, content: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")


def main(argv: Optional[Iterable[str]] = None) -> int:
    parser = argparse.ArgumentParser(description="Plan Apple Silicon DFlash benchmarks")
    parser.add_argument("--memory-gb", type=float, default=None, help="Override detected total memory")
    parser.add_argument("--pair", choices=[pair.slug for pair in SUPPORTED_PAIRS], default=None)
    parser.add_argument("--machine-label", default="Apple Silicon Mac")
    parser.add_argument("--format", choices=["json", "markdown"], default="markdown")
    parser.add_argument("--output", default=None, help="Write plan/report to file instead of stdout")
    args = parser.parse_args(list(argv) if argv is not None else None)

    memory_gb = args.memory_gb if args.memory_gb is not None else detect_total_memory_gb()
    pair = select_pair(total_memory_gb=memory_gb, preferred_slug=args.pair)

    if args.format == "json":
        content = json.dumps(build_plan(memory_gb, preferred_slug=pair.slug), indent=2)
    else:
        content = render_report_template(args.machine_label, pair)

    if args.output:
        write_output(Path(args.output), content)
    else:
        print(content)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
41 benchmarks/reports/dflash_m3max_36gb.md (Normal file)
@@ -0,0 +1,41 @@
# DFlash Apple Silicon Benchmark Report

## Machine
- Label: M3 Max 36GB
- Selected pair: qwen35-9b
- Base model: Qwen/Qwen3.5-9B
- Draft model: z-lab/Qwen3.5-9B-DFlash
- Estimated total weight footprint: 19.93 GB

## Setup
```bash
python3 -m venv .venv-dflash
source .venv-dflash/bin/activate
git clone https://github.com/z-lab/dflash.git
cd dflash
pip install -e .[mlx]
python -m dflash.benchmark --backend mlx \
 --model Qwen/Qwen3.5-9B \
 --draft-model z-lab/Qwen3.5-9B-DFlash \
 --dataset gsm8k \
 --max-samples 128 \
 --enable-thinking \
 --draft-sliding-window-size 4096
```

## Baseline comparison
Compare against **plain MLX or llama.cpp speculative decoding** on the same prompt set.

## Results
- Throughput (tok/s):
- Peak memory (GB):
- Notes on acceptance / behavior:

## Verdict
Worth operationalizing locally?
- [ ] Yes
- [ ] No
- [ ] Needs more data

## Recommendation
Explain whether this should become part of the local inference stack.
46 benchmarks/reports/dflash_m3max_36gb_qwen35_4b_pilot.md (Normal file)
@@ -0,0 +1,46 @@
# DFlash Apple Silicon Pilot — Qwen3.5-4B on M3 Max 36GB

Date: 2026-04-21
Machine: Apple M3 Max, 36 GB unified memory
Repo issue: #152

## Command

```bash
source /tmp/dflash-venv/bin/activate
cd /tmp/dflash-upstream
python -m dflash.benchmark --backend mlx \
 --model Qwen/Qwen3.5-4B \
 --draft-model z-lab/Qwen3.5-4B-DFlash \
 --dataset gsm8k \
 --max-samples 1 \
 --enable-thinking \
 --draft-sliding-window-size 4096
```

## Result

- Dataset: `gsm8k`
- Samples: `1`
- Baseline throughput: `22.35 tok/s`
- DFlash throughput: `46.78 tok/s`
- Decoding speedup: `2.09x`
- Average acceptance length: `6.48`

Acceptance length histogram:

```text
['0.3%', '11.1%', '12.7%', '10.4%', '11.7%', '7.6%', '7.0%', '3.8%', '5.1%', '6.3%', '2.8%', '3.8%', '2.2%', '1.9%', '0.9%', '2.5%', '9.8%']
```

## Caveats

- This is a **pilot**, not a decision-grade benchmark.
- Only `1` sample was run, so the throughput number is directional.
- No apples-to-apples baseline against plain MLX or llama.cpp speculative decoding is included yet.
- The planner still recommends trying `Qwen/Qwen3.5-9B + z-lab/Qwen3.5-9B-DFlash` on this machine for the more meaningful fit test.

## Interim takeaway

DFlash is **real on Apple Silicon** and already shows a meaningful local speedup on a small matched pair.
A `2.09x` pilot speedup on `Qwen3.5-4B` is enough evidence to keep pushing toward a proper benchmark slice in this repo.
59 benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md (Normal file)
@@ -0,0 +1,59 @@
# DFlash on Apple Silicon Failure Report — Qwen3.5-9B on M3 Max 36GB

Date: 2026-04-21
Machine: Apple M3 Max, 36 GB unified memory
Repo issue: #152

## Command

```bash
source /tmp/dflash-venv/bin/activate
cd /tmp/dflash-upstream
python -m dflash.benchmark --backend mlx \
 --model Qwen/Qwen3.5-9B \
 --draft-model z-lab/Qwen3.5-9B-DFlash \
 --dataset gsm8k \
 --max-samples 1 \
 --enable-thinking \
 --draft-sliding-window-size 4096
```

## Outcome

The benchmark did **not** complete successfully on this machine.

### Failure signature

```text
libc++abi: terminating due to uncaught exception of type std::runtime_error:
[METAL] Command buffer execution failed:
Caused GPU Timeout Error (00000002:kIOGPUCommandBufferCallbackErrorTimeout)
```

Additional shutdown noise:

```text
bash: [11285: 1] tcsetattr: Inappropriate ioctl for device
resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
```

## Interpretation

This is strong evidence that the `Qwen/Qwen3.5-9B + z-lab/Qwen3.5-9B-DFlash` pair is **not currently stable** on an M3 Max 36GB Mac under the upstream MLX benchmark path, at least with the default settings used here.

It may still be salvageable with:
- smaller block size / different benchmark settings
- a shorter generation target
- a different prompt sample
- upstream MLX / Metal fixes
- newer Apple Silicon hardware

But as of this run, it should be treated as **experimental / failing** on this exact machine.

## Recommendation

For this Mac, the working local proof path is still:
- `Qwen/Qwen3.5-4B`
- `z-lab/Qwen3.5-4B-DFlash`

Use the 4B pair for reproducible local validation while the 9B Metal timeout is investigated separately.
125 docs/DFLASH_APPLE_SILICON.md (Normal file)
@@ -0,0 +1,125 @@
# DFlash on Apple Silicon

This repo now carries a **Gitea-first benchmark harness** for evaluating whether upstream **DFlash on MLX** is worth adding to the local Apple Silicon inference stack.

## Why

The headline `Kimi K2.6 + DFlash` benchmark was measured on `8x MI300X` with huge RAM and ROCm patches. That exact recipe is not a fit for a `36 GB` Apple Silicon Mac.

What *is* relevant locally is the upstream `z-lab/dflash` MLX path, which can benchmark smaller matched target/draft pairs that fit on Apple Silicon.

## Current repo entry point

Use:

```bash
python3 benchmarks/dflash_apple_silicon.py --machine-label "M3 Max 36GB"
```

This prints a benchmark report template with:
- the selected model/draft pair
- exact setup commands
- the upstream MLX benchmark command
- baseline comparison guidance

Write the template to a file:

```bash
python3 benchmarks/dflash_apple_silicon.py \
 --machine-label "M3 Max 36GB" \
 --output benchmarks/reports/dflash_m3max_36gb.md
```

Emit the underlying plan as JSON:

```bash
python3 benchmarks/dflash_apple_silicon.py --format json
```
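
The same plan is also reachable from Python, since the CLI is a thin wrapper over `build_plan()`. A minimal sketch, assuming the script is run from the repo root and `benchmarks/` is added to the import path by hand (it is a plain directory, not an installed package):

```python
# Sketch: call the planner directly instead of the CLI (run from the repo root).
import sys

sys.path.insert(0, "benchmarks")  # benchmarks/ is a plain directory, not an installed package

from dflash_apple_silicon import build_plan, detect_total_memory_gb

plan = build_plan(detect_total_memory_gb())
print(plan["selected_pair"]["slug"])   # e.g. "qwen35-9b" on a 36 GB machine
print(plan["benchmark_command"])       # the exact upstream MLX command the report embeds
```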
## Selection logic

Today the planner uses two upstream-supported MLX pairs:

- `qwen35-9b`
  - base: `Qwen/Qwen3.5-9B`
  - draft: `z-lab/Qwen3.5-9B-DFlash`
  - chosen for ~28 GB+ machines
- `qwen35-4b`
  - base: `Qwen/Qwen3.5-4B`
  - draft: `z-lab/Qwen3.5-4B-DFlash`
  - fallback for tighter-memory Macs

On a `36 GB` Mac, the default recommendation is `qwen35-9b`.
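
The rule implemented in `select_pair()` is: keep every pair whose minimum recommended memory fits the machine, take the largest of those, and fall back to the smallest pair when nothing fits. A quick illustration (a sketch, not a shipped test):

```python
import sys

sys.path.insert(0, "benchmarks")  # same import-path caveat as above

from dflash_apple_silicon import select_pair

print(select_pair(total_memory_gb=36.0).slug)  # "qwen35-9b": both pairs fit, the larger minimum wins
print(select_pair(total_memory_gb=16.0).slug)  # "qwen35-4b": only the 4B pair fits
print(select_pair(total_memory_gb=8.0).slug)   # "qwen35-4b": nothing fits, fall back to the smallest pair
```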
## Pilot result already landed

A first live Apple Silicon run has already been captured in:

- `benchmarks/reports/dflash_m3max_36gb_qwen35_4b_pilot.md`

Pilot command:

```bash
python -m dflash.benchmark --backend mlx \
 --model Qwen/Qwen3.5-4B \
 --draft-model z-lab/Qwen3.5-4B-DFlash \
 --dataset gsm8k \
 --max-samples 1 \
 --enable-thinking \
 --draft-sliding-window-size 4096
```

Pilot outcome on this Mac:

- baseline throughput: `22.35 tok/s`
- DFlash throughput: `46.78 tok/s`
- decoding speedup: `2.09x`

Treat that as a **directional proof**, not a final decision benchmark. The next step is the fuller comparison slice against plain MLX or llama.cpp speculative decoding.

## Known 9B failure on this machine

A follow-up live run with:

- `Qwen/Qwen3.5-9B`
- `z-lab/Qwen3.5-9B-DFlash`

failed on this same M3 Max 36GB Mac with:

```text
[METAL] Command buffer execution failed:
Caused GPU Timeout Error (00000002:kIOGPUCommandBufferCallbackErrorTimeout)
```

That failure is recorded in:

- `benchmarks/reports/dflash_m3max_36gb_qwen35_9b_timeout.md`

So the current guidance is:
- treat `qwen35-9b` as **experimental** on this machine
- treat `qwen35-4b` as the current **known-working local proof path**
- keep the issue open until we either stabilize the 9B path or clearly rule it out for this hardware tier

## Upstream benchmark command

The harness uses the upstream MLX benchmark syntax from `z-lab/dflash`:

```bash
python -m dflash.benchmark --backend mlx \
 --model Qwen/Qwen3.5-9B \
 --draft-model z-lab/Qwen3.5-9B-DFlash \
 --dataset gsm8k \
 --max-samples 128 \
 --enable-thinking \
 --draft-sliding-window-size 4096
```

## What remains

This PR adds the **planner + report template** so the benchmark is reproducible from the repo.
The issue remains open until a real Apple Silicon run lands with:

- measured throughput
- measured memory
- a baseline comparison against plain MLX or llama.cpp speculative decoding
- a recommendation on whether to operationalize DFlash locally
docs/PROJECT_STATUS.md
@@ -1,21 +1,6 @@
# PROJECT STATUS — Living Tracker

> **For current status, see [STATUS_TRACKER.md](./STATUS_TRACKER.md).**
> Updated on each milestone. This file contains detailed phase reports.
>
> Quick view:
> - Phase 1: DONE
> - Phase 2: IN PROGRESS
> - Edge Crisis Detection: DONE
> - Integration PR: NOT STARTED
> - QJL: NOT STARTED
> - Ollama: NOT STARTED

---

# TurboQuant Project Status

# TurboQuant Phase 1 Report — PolarQuant MVP

**Date:** 2026-03-30
**Prepared by:** Timmy (execution) for Frankie's team (Strago, Cid, Locke, John)
@@ -31,13 +16,13 @@ Phase 1 is COMPLETE. TurboQuant KV cache compression works on Apple Silicon with

---

## Gate Check (#2): PASSED ✅

Metal shaders exist and are comprehensive:
- Full flash attention for turbo2/3/4 with dk32-dk576 variants
- WHT rotation kernels (turbo_fwht_128, turbo_rotate_forward/inverse)
- PolarQuant codebooks hardcoded (Lloyd-Max for N(0, 1/√128))
- Asymmetric K/V support (q8_0 × turbo mixed pairs)
- M4+ optimizations (4-mag LUT), sparse V dequant, profiling modes
- Additional experiment branches: layer-adaptive, fused-centroid-decode, speed-optimization

@@ -45,7 +30,7 @@ Metal shaders exist and are comprehensive:

---

## Fork Assessment (#3): PASSED ✅

- Branch: `feature/turboquant-kv-cache` (commit adac2c6)
- Fork freshness: ADEQUATE (recent enough for direct build)
@@ -54,18 +39,18 @@ Metal shaders exist and are comprehensive:

---

## PolarQuant Verification (#5): 5/6 PASS, 1 PARTIAL ✅

| Item | Verdict |
|------|---------|
| WHT rotation (structured orthogonal) | PARTIAL PASS — Metal GPU uses WHT ✅. CPU turbo4 ref uses dense random (legacy, not production) |
| Same rotation quant/dequant | PASS — turbo_rotate_forward() ↔ turbo_rotate_inverse() identical sign arrays |
| Lloyd-Max codebook (not uniform) | PASS — non-uniform centroids, "Lloyd-Max for N(0, 1/128)" |
| Radius at FP16+ | PASS — ggml_half norm per 128-element group |
| No per-vector normalization | PASS — one group norm only, static_asserts enforce block sizes |
| Dequant matches quant in Metal | PASS — same centroids, signs, butterfly structure |

**⚠️ Flag for Cid:** CPU turbo4 reference path is incompatible with Metal dequant. Only matters if CPU fallback is ever invoked for turbo4.

---

@@ -77,9 +62,9 @@ Metal shaders exist and are comprehensive:

### Throughput (3-run averages)

| Config (K/V) | Prompt (pp512) | Δ | Generation (tg128) | Δ |
|:-------------|:---------------|:--|:-------------------|:--|
| f16/f16 (baseline) | 304.28 t/s | — | 27.47 t/s | — |
| **turbo4/turbo4** | **300.00 t/s** | **-1.1%** | **22.45 t/s** | **-11.1%** |
| turbo3/turbo3 | 271.07 t/s | -10.7% | 21.07 t/s | -16.6% |
| q8_0/turbo4 (asym) | 260.57 t/s | -14.1% | 23.75 t/s | -5.9% |
@@ -93,17 +78,17 @@ Metal shaders exist and are comprehensive:
| 32K | 5,120 MiB | 1,360 MiB | 73.4% |
| 65K | 10,240 MiB | 2,720 MiB | 73.4% |

Measured matches calculated exactly — zero fragmentation overhead.

### Pass Criteria Assessment

| Criteria | Threshold | Result | Verdict |
|:---------|:----------|:-------|:--------|
| PPL delta ≤ 0.5 | ≤ 0.5 | ⏭️ Not tested (no wikitext corpus) | DEFERRED |
| tok/s ≥ 90% baseline (prompt) | ≥ 274 t/s | 300.00 t/s (98.9%) | **PASS** |
| tok/s ≥ 90% baseline (gen) | ≥ 24.7 t/s | 22.45 t/s (89%) | **BORDERLINE** |
| No OOM at 32K | No crash | Runs clean | **PASS** |
| Memory consistent with theory | ±15% | 0% delta | **PASS** |

---
@@ -111,10 +96,10 @@ Measured matches calculated exactly — zero fragmentation overhead.

| Scenario | Total Memory | Fits in 31GB? |
|:---------|:-------------|:--------------|
| 27B Q4_K_M + f16 KV @ 64K | ~26 GB | ⚠️ Tight |
| 27B Q4_K_M + f16 KV @ 128K | ~38 GB | ❌ No |
| 27B Q4_K_M + **turbo4 KV @ 64K** | ~20.5 GB | ✅ Comfortable |
| 27B Q4_K_M + **turbo4 KV @ 128K** | ~23.4 GB | ✅ Fits (7.6GB headroom) |

**TurboQuant turns 128K context from impossible to comfortable.**

@@ -122,11 +107,11 @@ Measured matches calculated exactly — zero fragmentation overhead.

## Open Items for Phase 2

1. **Perplexity test** — Need wikitext-2-raw corpus downloaded. PPL is the most important quality metric and we don't have it yet.
2. **Ollama integration** — CLI is a broken symlink. Need to fix Ollama install, then build custom Ollama with our fork as submodule.
3. **qwen3.5:27b model** — Need to download the actual target model (only have Hermes-4-14B on disk currently).
4. **10 test prompts** — Need to be written before Phase 2 quality comparison.
5. **Generation speed borderline** — tg128 at 89% is just below the 90% threshold. May improve with the speed-optimization branch. Worth testing.

---

@@ -134,21 +119,21 @@ Measured matches calculated exactly — zero fragmentation overhead.

**PROCEED TO PHASE 2.**

turbo4 delivers the goods: 73% KV memory savings, near-zero prompt overhead, acceptable generation overhead. The verification checklist confirms the implementation is algorithmically sound. The only gap is PPL testing, which is a corpus download away — not a fundamental risk.

The real unlock — 128K context on 36GB hardware — is within reach. Phase 2 is Ollama integration and production deployment.

---

## Issues Closed

- [x] #2 Metal kernel check — PASSED
- [x] #3 Fork assessment — PASSED
- [x] #4 Build llama.cpp fork — COMPLETE
- [x] #5 PolarQuant verification — 5/6 PASS
- [x] #6 FP16 baseline benchmarks — RECORDED
- [x] #7 TurboQuant benchmarks — RECORDED
- [x] #8 Memory profiling — COMPLETE

---

@@ -158,7 +143,7 @@ The real unlock — 128K context on 36GB hardware — is within reach. Pha

---

# TurboQuant — Full Knowledge Transfer Report

**Date:** 2026-03-30
**Prepared for:** Frankie's Team (Strago, Cid, Locke, John)
@@ -168,7 +153,7 @@ The real unlock — 128K context on 36GB hardware — is within reach. Pha

## TL;DR

TurboQuant works. PolarQuant KV cache compression delivers **73% memory savings with 1% prompt overhead**. 128K context on the MacBook becomes viable. Custom Ollama build is deferred (multi-day effort), but the fork's `llama-server` is a ready drop-in. Per-layer adaptive quantization is already implemented. QJL is infrastructure-only — not needed at current compression targets.

---

@@ -181,14 +166,14 @@ Impact: Memory budget **increases** from ~27GB to ~31GB usable. Model ceiling im

---

## Phase 1 — PolarQuant MVP: COMPLETE ✅

### Gate Check (#2): Metal Shaders EXIST
The `feature/turboquant-kv-cache` branch has production-quality Metal support:
- Flash attention for turbo2/3/4 (all dk variants)
- WHT rotation kernels (turbo_fwht_128)
- Lloyd-Max codebooks (hardcoded, non-uniform)
- Asymmetric K/V (q8_0 × turbo mixed)
- Runtime optimizations: 4-mag LUT (M4+), sparse V dequant, profiling

**Note:** Allegro's analysis (checking only `master` branch) incorrectly concluded "NO TurboQuant." The implementation lives on the feature branch.
@@ -212,9 +197,9 @@ The `feature/turboquant-kv-cache` branch has production-quality Metal support:

#### Throughput

| Config (K/V) | Prompt (pp512) | Δ | Generation (tg128) | Δ |
|:-------------|:---------------|:--|:-------------------|:--|
| f16/f16 (baseline) | 304.28 t/s | — | 27.47 t/s | — |
| **turbo4/turbo4** | **300.00 t/s** | **-1.1%** | **22.45 t/s** | **-11.1%** |
| turbo3/turbo3 | 271.07 t/s | -10.7% | 21.07 t/s | -16.6% |
| q8_0/turbo4 (asymmetric) | 260.57 t/s | -14.1% | 23.75 t/s | -5.9% |
@@ -234,12 +219,12 @@ Measured matches calculated exactly. Zero fragmentation overhead.

| Scenario | Total Memory | Fits 31GB? |
|:---------|:-------------|:-----------|
| 27B + f16 KV @ 128K | ~38 GB | ❌ No |
| 27B + **turbo4 KV @ 128K** | **~23.4 GB** | **✅ Yes (7.6GB headroom)** |

---
## Phase 2 — Ollama Integration: PARTIALLY COMPLETE

### What Works
- Ollama installation fixed (v0.17.7, running on :11434)
@@ -271,7 +256,7 @@ The fork's `llama-server` binary is **already built and working**:
- Streaming SSE support
- All TurboQuant KV types supported
- Per-layer adaptive via TURBO_LAYER_ADAPTIVE env var
- Same port/protocol as Ollama — clients don't need to change

### Outstanding Phase 2 Items for Cid
- [ ] Download qwen3.5:27b Q4_K_M model
@@ -282,7 +267,7 @@ The fork's `llama-server` binary is **already built and working**:

---

## Phase 2.5 — Per-Layer Quantization: ALREADY IMPLEMENTED ✅

Found in the fork. No additional work needed.

@@ -306,10 +291,10 @@ Mode benchmarks queued. Uniform turbo4 baseline established. Per-layer modes exp

---

## Phase 3 — QJL: ASSESSED, NOT NEEDED ✅

### Finding
**turbo4 is pure 4-bit PolarQuant** — QJL is NOT active.

`TURBO4_USE_4BIT` defaults to 1 in `ggml-common.h`. The legacy 3-bit+QJL path exists but is disabled. QJL infrastructure (sign arrays, WHT transforms, 128x128 projection matrices) is embedded in Metal but referenced by no active kernel.

@@ -322,7 +307,7 @@ Mode benchmarks queued. Uniform turbo4 baseline established. Per-layer modes exp

| Repo | Status | Value |
|:-----|:-------|:------|
| TheTom/llama-cpp-turboquant | **PRIMARY** — production Metal shaders on feature branch | Build from this |
| TheTom/turboquant_plus | Python reference + 511 tests | Algorithm verification |
| rachittshah/mlx-turboquant | Complete MLX PoC, 2-5x slower (no Metal fusion) | Quality validation reference |
| amirzandieh/QJL | Author CUDA (~1500 lines) | Future QJL Metal port reference |
@@ -333,12 +318,12 @@ Mode benchmarks queued. Uniform turbo4 baseline established. Per-layer modes exp

| Risk | Status | Mitigation |
|:-----|:-------|:-----------|
| Metal shaders missing | ✅ RESOLVED — they exist | — |
| Fork too stale | ✅ RESOLVED — builds clean | — |
| Ollama integration blocked | ⚠️ ACTIVE — multi-day effort | Use llama-server instead |
| PPL regression | ⏸️ UNTESTED — needs wikitext corpus | Download and test in prod |
| tg128 borderline (89% vs 90% threshold) | ⚠️ MINOR — within measurement noise | speed-optimization branch may help |
| CPU turbo4 incompatible with Metal | ℹ️ LOW — only matters if Metal unavailable | Document; Metal is production path |

---

@@ -370,7 +355,7 @@ Step 4: Validate

Step 5: Run quality matrix (prompts on issue #16)
Step 6: John reviews output quality
Step 7: If pass → production. If fail → drop to turbo3 or adjust per-layer profile.
```

---

@@ -380,21 +365,21 @@ Step 7: If pass → production. If fail → drop to turbo3 or adjust per-l

| # | Title | Status |
|:--|:------|:-------|
| 1 | Epic: TurboQuant KV Cache Compression | Open (tracker) |
| 2 | Metal kernel check | ✅ Closed — PASS |
| 3 | Fork assessment | ✅ Closed — PASS, M3 Max 36GB |
| 4 | Build llama.cpp fork | ✅ Closed — clean build |
| 5 | PolarQuant verification | ✅ Closed — 5/6 PASS |
| 6 | Baseline benchmarks | ✅ Closed — recorded |
| 7 | TurboQuant benchmarks | ✅ Closed — 73% savings |
| 8 | Memory profiling | ✅ Closed — 0% fragmentation |
| 9 | Ollama API check | ✅ Closed — additive, but diverged |
| 10 | Custom Ollama build | ✅ Closed — deferred, llama-server instead |
| 11 | Full test matrix | Open — awaiting production deploy |
| 12 | Long-session test | Open — awaiting production deploy |
| 13 | Per-layer profiles | ✅ Closed — already implemented |
| 14 | QJL assessment | ✅ Closed — not needed |
| 15 | Upstream watch | Open — ongoing |
| 16 | Test prompts | Open — Allegro contributed prompts |

**12/16 issues resolved. 4 remaining are production validation tasks for Cid.**

@@ -407,8 +392,8 @@ Step 7: If pass → production. If fail → drop to turbo3 or adjust per-l

---
# TurboQuant Implementation — Build Spec (v2)
**Prepared by:** Strago | **Date:** 2026-03-30 | **Updated:** 2026-03-30 (v2 — external review fixes)
**Task:** STR-2026-03-30-01 | **For:** Cid (build) + Frankie (coordination)
**Inputs read:** turboquant-2026-03-25.md (Google brief), turboquant-2026-03-30-recon-update.md (Locke recon), infra-bulletin.md, MEMORY.md, external Opus review

@@ -419,64 +404,64 @@ Step 7: If pass → production. If fail → drop to turbo3 or adjust per-l

John wants maximum local inference quality on the MacBook Pro (M4 Max, 32GB unified memory) using TurboQuant-level KV cache compression. Currently running `qwen3.5:27b` via Ollama at `10.0.0.133:11434`. The goal: run a larger or better model within the same 32GB memory envelope by compressing the KV cache during inference.

TurboQuant (Google, ICLR 2026) is a three-stage KV cache compression method:
1. **PolarQuant** — random rotation + polar coordinates + fixed scalar codebook. No normalization constants. ~4.2× compression.
2. **QJL** — 1-bit quantized Johnson-Lindenstrauss on the residual. Zero-overhead bias correction.
3. **TurboQuant** — PolarQuant for main signal + QJL for residual = unbiased inner product quantizer at ~3.5 bits/channel with zero accuracy loss.

Community status: multiple `llama.cpp` forks, MLX proof-of-concepts, and a vLLM plugin exist. Nothing upstreamed to official `llama.cpp`, MLX, or Ollama yet. Author QJL code is public. Enough is public to build from.

---

## 1a. PolarQuant Technical Detail — What Cid Needs to Verify

This section specifies the PolarQuant algorithm concretely so Cid can verify that the community fork implements it correctly. A fork that gets the rotation wrong or uses the wrong codebook boundaries will compress successfully but degrade quality in ways that short PPL benchmarks may not catch — the damage surfaces during long production sessions with sustained context pressure.

### The Algorithm (per KV vector)

**Step 1 — Random Rotation (Preconditioning):**
- Apply a fixed random orthogonal rotation to each KV vector before quantization.
- The paper uses a **Walsh-Hadamard transform** (WHT) — a structured orthogonal matrix that's O(d log d) to apply, not O(d²) like a dense random matrix.
- **Why:** Raw KV vectors have non-uniform coordinate distributions (some dimensions carry more energy). WHT spreads energy uniformly across all coordinates, making the post-rotation distribution predictable and concentrated. This is what eliminates the need for per-vector normalization constants.
- **Cid verification:** The fork must use a fixed WHT (or equivalent structured orthogonal rotation), not a learned or per-layer rotation. The rotation matrix must be identical at quantization and dequantization. If the fork uses a dense random matrix instead of WHT, it's functionally correct but slower — flag it.
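
The O(d log d) cost quoted for the WHT above comes from its butterfly form. A minimal NumPy sketch of that butterfly, for illustration only (the fork's production path is its `turbo_fwht_128` Metal kernel):

```python
import numpy as np

def fwht(x: np.ndarray) -> np.ndarray:
    """Walsh-Hadamard transform via the O(d log d) butterfly; input length must be a power of 2."""
    x = x.astype(np.float64)
    h = 1
    while h < len(x):
        for i in range(0, len(x), 2 * h):
            a = x[i:i + h].copy()
            b = x[i + h:i + 2 * h].copy()
            x[i:i + h] = a + b          # butterfly: sums in the first half ...
            x[i + h:i + 2 * h] = a - b  # ... differences in the second half
        h *= 2
    x /= np.sqrt(len(x))                # orthonormal scaling, so the transform is its own inverse
    return x

x = np.random.randn(128)
np.testing.assert_allclose(fwht(fwht(x)), x, atol=1e-9)   # applying it twice recovers the input
```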
**Step 2 — Polar Coordinate Transform:**
- After rotation, decompose each vector into **radius** (L2 norm / signal strength) and **angle** (direction on the unit sphere).
- The radius is stored at higher precision (FP16 or FP32) — it's one scalar per vector, negligible overhead.
- The angle coordinates are what get quantized. Because WHT made their distribution predictable, you can use a fixed codebook without per-vector calibration.

**Step 3 — Lloyd-Max Scalar Quantization:**
- Each angle coordinate is independently quantized using a **Lloyd-Max optimal scalar quantizer**.
- Lloyd-Max minimizes mean squared error for a known distribution. Because WHT makes the distribution analytically computable, the codebook boundaries are **precomputed once** and fixed for all vectors.
- **Codebook sizes by compression target:**
  - `turbo4` = 4 bits per coordinate = 16 codebook entries per dimension
  - `turbo3` = 3 bits = 8 entries
  - `turbo2` = 2 bits = 4 entries
- **Cid verification:** Check that the fork's codebook boundaries match what the paper/PolarQuant paper specifies for the target distribution. If the fork uses uniform quantization instead of Lloyd-Max, that's a quality regression — uniform is simpler but wastes bits on low-probability regions.

**Step 4 — Bit Packing + Storage:**
- Quantized indices are packed into the KV cache format (turbo2/3/4 nibble-packed).
- Radius stored separately. No normalization constants, no scale factors, no zero-points — this is the key advantage over standard quantization.

### Dequantization During Attention

When the model computes attention scores (Q·K^T) and weighted values (softmax·V):
1. Read packed indices from cache
2. Look up codebook values (single table lookup per coordinate)
3. Reconstruct angle coordinates
4. Scale by stored radius
5. Compute dot product in reconstructed space

**Critical property:** The inner product between a full-precision query Q and a PolarQuant-compressed K must be an unbiased estimator of the true Q·K dot product. The WHT rotation preserves this because orthogonal transforms preserve inner products. If the fork adds any non-orthogonal transformation (e.g., learned projection, PCA), the unbiasedness guarantee breaks.
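
Putting the four steps and the dequantization path together, here is a minimal NumPy sketch of the per-group flow (turbo4 flavor, d = 128). It is illustrative only: the real fork hardcodes Lloyd-Max centroids for the post-WHT distribution and fuses everything into Metal kernels, while the evenly spaced codebook below is a crude placeholder that keeps the structure visible.

```python
import numpy as np

D = 128                                              # head dimension / quantization group size

def hadamard(n: int) -> np.ndarray:
    """Dense Sylvester-construction WHT, normalized so H @ H.T == I."""
    h = np.array([[1.0]])
    while h.shape[0] < n:
        h = np.block([[h, h], [h, -h]])
    return h / np.sqrt(n)

H = hadamard(D)                                      # fixed rotation, identical at quant and dequant
CODEBOOK = np.linspace(-3.0, 3.0, 16) / np.sqrt(D)   # placeholder for the 16 Lloyd-Max centroids

def polar_quant(x: np.ndarray):
    radius = np.float16(np.linalg.norm(x))           # Step 2: one FP16 radius per group
    z = (H @ x) / max(float(radius), 1e-12)          # Step 1 + 2: rotate, then split off the radius
    idx = np.abs(z[:, None] - CODEBOOK).argmin(axis=1).astype(np.uint8)  # Step 3: nearest centroid
    return radius, idx                               # Step 4: store radius + 4-bit indices (packed)

def polar_dequant(radius, idx) -> np.ndarray:
    z = CODEBOOK[idx]                                # codebook lookup
    return H.T @ (float(radius) * z)                 # scale by radius, undo the orthogonal rotation

x = np.random.randn(D).astype(np.float32)
q = np.random.randn(D).astype(np.float32)
radius, idx = polar_quant(x)
x_hat = polar_dequant(radius, idx)
print(float(q @ x), float(q @ x_hat))                # close: orthogonal rotation preserves inner products
```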
### PolarQuant Initialization — Codebook + Rotation Matrix Setup

PolarQuant requires two things to be initialized before inference can start:

1. **Walsh-Hadamard rotation matrix:** This is deterministic — a WHT of size d (model head dimension, typically 128) is computed from the recursive Hadamard construction. It's the same for every session, every model. Compute once at model load, store in memory. Cost: O(d log d) per head dimension — microseconds. No impact on model load time.

2. **Lloyd-Max codebook:** The quantization boundaries are precomputed for the known post-WHT distribution. For a given bit width (turbo4 = 4 bits = 16 entries), the codebook is a fixed lookup table of 16 boundary values + 16 reconstruction values. This is identical across sessions and models of the same head dimension. Can be hardcoded as a constant array or computed once at load time from the analytical distribution formula.
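
For the codebook in item 2, one rough way to reproduce such a table is plain 1-D Lloyd iterations on samples of the known distribution. This is only a sketch; the fork ships the resulting centroids as hardcoded constants rather than computing them at load:

```python
import numpy as np

def lloyd_max_codebook(bits: int = 4, std: float = 1.0, iters: int = 50, n: int = 100_000) -> np.ndarray:
    """Empirical Lloyd-Max centroids for N(0, std**2): 2**bits non-uniformly spaced levels."""
    rng = np.random.default_rng(0)
    samples = rng.normal(0.0, std, n)
    centroids = np.linspace(-2 * std, 2 * std, 2 ** bits)   # initial uniform guess
    for _ in range(iters):
        nearest = np.abs(samples[:, None] - centroids).argmin(axis=1)
        for k in range(len(centroids)):
            if np.any(nearest == k):
                centroids[k] = samples[nearest == k].mean()  # move each level to its cell mean
    return centroids

# 16 levels for turbo4; denser near zero, sparser in the tails, unlike a uniform grid.
print(lloyd_max_codebook(bits=4))
```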
**Expected initialization overhead:** Negligible — both are small deterministic computations. But **measure it during Phase 1**: time the gap between Ollama receiving a request and the first token appearing, with and without TurboQuant. If initialization adds >1 second to cold model load, investigate caching the tables to disk alongside the model file.

**Cid measurement target:** Report model load time (cold start) with and without TurboQuant. If >5 second delta, flag as UX issue.

@@ -490,9 +475,9 @@ PolarQuant requires two things to be initialized before inference can start:

---

## 1. Model Targeting — What Can We Run?

### Memory Budget — Realistic, Not Theoretical

On a 32GB M4 Max running macOS, you do NOT have 32GB for inference. Realistic budget:

@@ -504,18 +489,18 @@ On a 32GB M4 Max running macOS, you do NOT have 32GB for inference. Realistic bu
| Activation memory (intermediate tensors during forward pass) | ~1-3GB (varies by model/batch) |
| **Available for model weights + KV cache** | **~26-28GB** |

**Use 27GB as the planning ceiling.** The v1 spec said "leaves 2GB for OS" at 30GB peak — that's too tight. All memory calculations below use 27GB available.

### Current State (No TurboQuant)
- **qwen3.5:27b** at Q4_K_M (~16GB model weights) — fits within 27GB budget with room for KV cache
- At 32K context, KV cache for a 27B model at FP16 ≈ 4-6GB → total ~20-22GB. Comfortable.
- At 64K context, KV cache ≈ 8-12GB → total ~24-28GB. Marginal — may swap.
- At 128K context, KV cache grows to ~16-24GB → doesn't fit. Context-limited.

### With TurboQuant (4× KV Compression)
- KV cache at 32K drops from ~5GB → ~1.2GB
- KV cache at 64K drops from ~10GB → ~2.5GB
- KV cache at 128K drops from ~20GB → ~5GB
- This frees 4-15GB of headroom depending on context length

**Important:** These are calculated estimates, not measured values. Actual memory consumption can exceed theoretical due to fragmentation, allocation overhead, and implementation-specific buffering. Phase 1 **must** include actual peak memory measurement (see validation section). If measured exceeds calculated by >15%, the context ceiling drops accordingly.
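
For reference, the KV cache figures above follow from simple arithmetic. A sketch of the formula, where the layer/head/dimension values are assumed, illustrative numbers for a ~27B GQA model rather than values read from the actual GGUF:

```python
def kv_cache_gib(context_len: int, bits_per_elem: float,
                 n_layers: int = 48, n_kv_heads: int = 8, head_dim: int = 128) -> float:
    """Size of the K+V cache in GiB for one sequence at the given context length."""
    elems = 2 * n_layers * n_kv_heads * head_dim * context_len   # 2 = one K and one V tensor
    return elems * (bits_per_elem / 8) / (1024 ** 3)

print(kv_cache_gib(32_768, 16))    # ~6.0 GiB: f16 KV at 32K, in the ~4-6GB range quoted above
print(kv_cache_gib(131_072, 16))   # ~24 GiB: f16 KV at 128K, does not fit next to ~16GB of weights
print(kv_cache_gib(131_072, 4))    # ~6.0 GiB: turbo4 (4-bit) KV at 128K, before per-group radius overhead
```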
@@ -524,31 +509,31 @@ On a 32GB M4 Max running macOS, you do NOT have 32GB for inference. Realistic bu

**Primary target: qwen3.5:27b at Q4_K_M with extended context**
- Model weights: ~16GB at Q4_K_M
- With TurboQuant KV cache at 64K context: ~2.5GB cache + ~2GB activations → ~20-21GB total. Comfortable within 27GB budget.
- With TurboQuant at 128K: ~5GB cache + ~2GB activations → ~23GB total. Fits, but tight — **needs measured validation.**
- Without TurboQuant: 64K context KV cache ≈ 10GB → ~28GB total. OOM risk.
- **Win: 64K context becomes reliable, 128K becomes possible.** This is the real unlock.

**Stretch target: Qwen 3.5 32B (Q4_K_M)**
- Model weights: ~18-19GB at Q4_K_M
- With TurboQuant at 64K: ~2.5GB cache + ~2.5GB activations → ~23-24GB. Fits within 27GB but leaves only ~3GB headroom.
- **Verdict: worth testing in Phase 1 benchmarks alongside 27B.** If it fits, marginally better quality. If it's marginal, stay on 27B.

**Not recommended: Qwen 3.5 72B (Q2_K or IQ3_XXS)**
- Model weights at Q2_K: ~27GB. Leaves ~0GB for anything else.
- **Verdict: does not fit.** Even with TurboQuant, no room for KV cache + activations + Metal overhead. And quality at Q2_K is poor — weight quantization damage cancels the parameter count advantage.

**Recommended path: Stay on 27B class, use TurboQuant to unlock longer context (64K-128K) rather than a bigger model.** The real win on 32GB unified is context length, not parameter count. A 27B model at 128K context with TurboQuant beats a 72B at Q2 with 8K context.

**Alternative worth testing: Mistral/Codestral 25B-class models** at Q5_K_M (~18GB) with TurboQuant. Locke's research notes TurboQuant was benchmarked on Mistral — community results may be more reproducible.

---

## 2. Implementation Path — PolarQuant First, Then Full TurboQuant

**Recommendation: PolarQuant (Stage 1) first.** Matches Locke's recommendation. Rationale:

- PolarQuant alone delivers ~4.2× compression — that's the bulk of the win
- Full TurboQuant adds QJL residual correction for marginal quality improvement at extreme compression (2.5 bits)
- At 3.5+ bits/channel, PolarQuant is sufficient for zero accuracy loss
- QJL adds kernel complexity for small incremental gain at our target compression ratio
@@ -558,36 +543,36 @@ On a 32GB M4 Max running macOS, you do NOT have 32GB for inference. Realistic bu

| Repo | What | Why | Risk |
|------|------|-----|------|
| **`TheTom/llama-cpp-turboquant`** | `llama.cpp` fork with Metal support | Most directly useful — same stack as Ollama. Reports PPL numbers on M-series. | Community fork, not upstream. May lag `llama.cpp` HEAD. |
| **`TheTom/turboquant_plus`** | Standalone C implementation + Python tests | Most detailed reverse-engineering. 511+ tests. PolarQuant + Walsh-Hadamard + turbo2/3/4 formats. | Extends beyond paper ("Plus"). May include non-paper innovations. |
| **`amirzandieh/QJL`** | Author's QJL CUDA implementation | Official author code. CUDA kernels, eval scripts, LongBench commands. | CUDA only — needs Metal port for MacBook. Phase 2 dependency. |
| **`rachittshah/mlx-turboquant`** | MLX proof-of-concept | Native Apple Silicon. Correct module layout (codebooks, polar_quant, qjl). | May be partial implementation. Naming drift noted. |

**Start from:** `TheTom/llama-cpp-turboquant` (for Ollama integration path) + `TheTom/turboquant_plus` (for reference/tests).

### Community Fork Risk Assessment

The v1 spec understated this. Community `llama.cpp` forks can diverge significantly from HEAD, especially in the Metal backend where Apple Silicon optimizations change frequently. The risk isn't "it doesn't build" — it's "it builds fine on the fork's base commit but breaks when cherry-picked onto current HEAD."

**Specific risk areas:**
- **KV cache layer:** `llama.cpp` has refactored KV cache internals multiple times in 2026. A fork based on a 4-week-old commit may touch structs/functions that have been renamed or restructured upstream.
- **Metal shaders:** Apple Silicon Metal optimizations are actively changing. Custom Metal kernels for TurboQuant dequant may conflict with upstream shader refactors.
- **Memory management:** `ggml` memory allocation has evolved. The fork's cache allocation assumptions may not match current `ggml` memory pools.

**Mitigation plan (Phase 1 Step 0 — before any benchmarking):**

1. **Check fork freshness:** `git log --oneline -1` on the fork. Compare base commit date against `llama.cpp` HEAD. If >4 weeks stale, flag as HIGH risk.
2. **If fresh (< 2 weeks from HEAD):** Build directly. Likely works.
3. **If stale (2-4 weeks):** Attempt cherry-pick of TurboQuant-specific commits onto current HEAD. If merge conflicts are limited to TurboQuant files → resolve manually. If conflicts touch core KV cache / Metal code → stop, evaluate effort.
4. **If very stale (> 4 weeks) or conflicts are extensive:** Switch to **clean-room approach** — use `TheTom/turboquant_plus` as the algorithm reference and implement the KV cache types directly into current `llama.cpp` HEAD. This is more work (~60-90 min instead of ~20-40 min) but avoids the merge conflict maze.
5. **Escape hatch:** If `llama.cpp` path is blocked, fall back to `rachittshah/mlx-turboquant` (MLX native, no fork divergence risk, but requires API proxy for Ollama compatibility).

**Cid decision point:** After Step 0, report fork age + conflict assessment before proceeding. If clean-room is needed, update the time estimate and Frankie adjusts the schedule. Don't spend more than 15 minutes fighting merge conflicts — switch to clean-room.

### Metal Kernel Risk — The Single Highest-Risk Assumption

The spec assumes the `llama.cpp` fork has working **Metal shaders** for PolarQuant KV dequantization. KV dequant happens in the attention computation hot path — every token, every layer, every head. If the fork only has CPU or CUDA dequant kernels and no Metal implementation, the MacBook will either:
- Fall back to CPU dequant → **catastrophic** performance loss (10-50× slower attention)
- Fail to build entirely for Metal backend

**Cid's actual first action** (before building, before benchmarking, before anything):
@@ -612,12 +597,12 @@ This check takes 2 minutes and determines the entire build strategy. Do it first

---

## 3. Integration Target — llama.cpp → Ollama

**Primary: `llama.cpp` fork → custom Ollama build.**

Why not MLX:
- Our entire fleet uses Ollama. Model management, API compatibility, endpoint routing — all built around Ollama.
- MLX would require a separate inference server, separate model format, separate API integration.
- Ollama is built on `llama.cpp`/`ggml`. KV cache changes in `llama.cpp` propagate to Ollama.
|
||||
|
||||
@@ -626,13 +611,13 @@ Why not MLX:
|
||||
2. Validate quality + performance
|
||||
3. Build custom Ollama from our `llama.cpp` fork (Ollama builds `llama.cpp` as a submodule)
|
||||
4. Deploy to MacBook as replacement Ollama binary
|
||||
5. Existing model files, API, and endpoint (`10.0.0.133:11434`) remain identical â only the inference engine changes
|
||||
5. Existing model files, API, and endpoint (`10.0.0.133:11434`) remain identical — only the inference engine changes
|
||||
|
||||
**Fallback: MLX standalone** if `llama.cpp` Metal integration proves too complex. `rachittshah/mlx-turboquant` as starting point. Would require a small proxy server to maintain API compatibility with our Ollama endpoint.
|
||||
|
||||
---
|
||||
|
||||
## 4. Validation Plan â How We Know It Works
|
||||
## 4. Validation Plan — How We Know It Works
|
||||
|
||||
### Quality Validation
|
||||
|
||||
@@ -640,24 +625,24 @@ Why not MLX:
|
||||
|
||||
| Test | What It Measures | Tool | Pass Criteria |
|
||||
|------|-----------------|------|--------------|
|
||||
| Perplexity (PPL) | Overall language modeling quality | `llama-perplexity` on WikiText-2 | PPL delta ⤠0.5 from baseline (FP16 KV) |
|
||||
| Perplexity (PPL) | Overall language modeling quality | `llama-perplexity` on WikiText-2 | PPL delta ≤ 0.5 from baseline (FP16 KV) |
|
||||
| Needle-in-Haystack | Long context retrieval | Custom prompt at 8K/16K/32K/64K/128K | 100% retrieval at all lengths where baseline passes |
|
||||
| Practical generation | Subjective quality | 10 predefined prompts (see test suite below) | Human review: no degradation on â¥9/10 |
|
||||
| Attention score accuracy | Inner product preservation | Cosine similarity between TurboQuant and FP16 attention weights | cosine sim ⥠0.995 |
|
||||
| Practical generation | Subjective quality | 10 predefined prompts (see test suite below) | Human review: no degradation on ≥9/10 |
|
||||
| Attention score accuracy | Inner product preservation | Cosine similarity between TurboQuant and FP16 attention weights | cosine sim ≥ 0.995 |
|
||||
|
||||
**Predefined Test Prompts (10 prompts, run identically on TurboQuant and FP16 KV baseline):**
|
||||
|
||||
| # | Category | Prompt Description | What It Tests |
|
||||
|---|----------|-------------------|---------------|
|
||||
| 1 | Long-context summarization | Feed 20K tokens of a research paper, ask for structured summary with citations | KV cache quality at length â compressed K/V must retain source detail |
|
||||
| 1 | Long-context summarization | Feed 20K tokens of a research paper, ask for structured summary with citations | KV cache quality at length — compressed K/V must retain source detail |
|
||||
| 2 | Multi-step reasoning | 5-step math word problem requiring chain-of-thought | Whether compressed KV degrades intermediate reasoning steps |
|
||||
| 3 | Code generation | Write a Python script with 3 functions, error handling, type hints | Precise token prediction â code is unforgiving of subtle quality drops |
|
||||
| 4 | Code debugging | Provide buggy code (3 bugs), ask to identify and fix all three | Attention to detail across context â must reference earlier code correctly |
|
||||
| 3 | Code generation | Write a Python script with 3 functions, error handling, type hints | Precise token prediction — code is unforgiving of subtle quality drops |
|
||||
| 4 | Code debugging | Provide buggy code (3 bugs), ask to identify and fix all three | Attention to detail across context — must reference earlier code correctly |
|
||||
| 5 | Factual recall (early context) | Provide 10 facts in the first 1K tokens, continue for 8K tokens of filler, ask about fact #3 | Retrieval from early context through compressed KV |
|
||||
| 6 | Creative writing | Write a 500-word short story with specific constraints (setting, character, twist) | Compression artifacts surface as repetition or coherence loss |
|
||||
| 7 | Multi-turn conversation | 10-turn technical Q&A where later questions reference earlier answers | Cross-turn coherence through accumulated compressed KV |
|
||||
| 8 | Structured output | Generate a JSON schema with 15+ fields, nested objects, and validation rules | Format precision â compressed KV must maintain structural consistency |
|
||||
| 9 | Translation + analysis | Translate a paragraph ENâES, then analyze the translation choices | Tests both generation quality and meta-reasoning about own output |
|
||||
| 8 | Structured output | Generate a JSON schema with 15+ fields, nested objects, and validation rules | Format precision — compressed KV must maintain structural consistency |
|
||||
| 9 | Translation + analysis | Translate a paragraph EN→ES, then analyze the translation choices | Tests both generation quality and meta-reasoning about own output |
|
||||
| 10 | Instruction following | Complex prompt with 8 specific formatting requirements (headers, bullet style, word limits, etc.) | Whether compression causes the model to "forget" constraints mid-generation |
|
||||
|
||||
**Prompts must be written and saved to `projects/sovereign-stack/turboquant-test-prompts.md` before Phase 2 benchmarks run.** Same prompts, same order, both configurations. This prevents unconscious cherry-picking.
|
||||
@@ -665,7 +650,7 @@ Why not MLX:
|
||||
**Asymmetric K/V test:** Run K at Q8_0, V at turbo4. Community reports this works well on sensitive models. Compare PPL vs symmetric turbo4 K+V.
|
||||
|
||||
**Long-session quality test (Phase 2 only):** Short-context PPL benchmarks can miss quality degradation that surfaces during sustained context pressure. During Phase 2, run one extended production simulation:
|
||||
- Generate a 50-turn multi-step reasoning conversation (code gen â debug â refactor â test â iterate)
|
||||
- Generate a 50-turn multi-step reasoning conversation (code gen → debug → refactor → test → iterate)
|
||||
- Compare output quality vs same conversation on FP16 KV baseline
|
||||
- Specifically watch for: coherence drift after turn 30+, hallucinated references to earlier context, attention score softmax concentration (if measurable)
|
||||
- This catches the case where codebook boundary errors accumulate over many KV cache writes in a single session
|
||||
@@ -674,16 +659,16 @@ Why not MLX:
|
||||
|
||||
| Metric | Measure | Pass Criteria |
|
||||
|--------|---------|--------------|
|
||||
| Tokens/second (generation) | `llama-bench` | â¥90% of baseline tok/s (small decode overhead acceptable) |
|
||||
| Time to first token (TTFT) | Timed prompt eval | â¤110% of baseline |
|
||||
| Tokens/second (generation) | `llama-bench` | ≥90% of baseline tok/s (small decode overhead acceptable) |
|
||||
| Time to first token (TTFT) | Timed prompt eval | ≤110% of baseline |
|
||||
| Peak resident memory | `footprint -p <pid>` or `vmmap --summary` at each context length | Stays under 27GB at target context length |
|
||||
| Memory vs theoretical | Compare measured peak to calculated estimate | If measured exceeds calculated by >15% â reduce context ceiling |
|
||||
| Memory vs theoretical | Compare measured peak to calculated estimate | If measured exceeds calculated by >15% → reduce context ceiling |
|
||||
| Context length ceiling | Binary search: max context before OOM or swap pressure | 64K minimum (vs ~32K baseline for 27B) |
|
||||
|
||||
### Kill Criteria
|
||||
- PPL regression > 1.0 at any compression level â abort that compression level
|
||||
- OOM at 32K context (baseline capability) â regression, abort
|
||||
- tok/s drops > 25% â dequant overhead too high, need kernel optimization before deploy
|
||||
- PPL regression > 1.0 at any compression level → abort that compression level
|
||||
- OOM at 32K context (baseline capability) → regression, abort
|
||||
- tok/s drops > 25% → dequant overhead too high, need kernel optimization before deploy
|
||||
|
||||
---
|
||||
|
||||
@@ -691,7 +676,7 @@ Why not MLX:
|
||||
|
||||
| Role | Owner | What |
|
||||
|------|-------|------|
|
||||
| Build spec | Strago | This document â
|
|
||||
| Build spec | Strago | This document ✅ |
|
||||
| Implementation | Cid | Fork `llama.cpp`, integrate PolarQuant KV cache, Metal kernels, build custom Ollama |
|
||||
| Validation | Cid | Run test matrix, report PPL/performance numbers |
|
||||
| Model selection | Cid | Test qwen3.5:27b + one Mistral variant, recommend best config |
|
||||
@@ -703,48 +688,48 @@ Why not MLX:
|
||||
|
||||
## 6. Phasing
|
||||
|
||||
### Phase 1 â PolarQuant MVP (Target: this week)
|
||||
### Phase 1 — PolarQuant MVP (Target: this week)
|
||||
|
||||
**Scope:**
|
||||
|
||||
**Step 0 â Fork Assessment (do this FIRST, report before proceeding):**
|
||||
**Step 0 — Fork Assessment (do this FIRST, report before proceeding):**
|
||||
- Clone `TheTom/llama-cpp-turboquant`
|
||||
- Check base commit age vs `llama.cpp` HEAD (`git log --oneline -1`)
|
||||
- Check `sysctl hw.memsize` on MacBook (resolve the 32/36/48GB question)
|
||||
- If fork < 2 weeks stale â proceed to build
|
||||
- If 2-4 weeks stale â attempt cherry-pick, report conflict scope
|
||||
- If > 4 weeks or conflicts extensive â switch to clean-room (see Fork Risk Assessment above)
|
||||
- If fork < 2 weeks stale → proceed to build
|
||||
- If 2-4 weeks stale → attempt cherry-pick, report conflict scope
|
||||
- If > 4 weeks or conflicts extensive → switch to clean-room (see Fork Risk Assessment above)
|
||||
- Report: fork age, conflict assessment, MacBook actual RAM, estimated build path time
|
||||
|
||||
**Step 1 â Build + Verify:**
|
||||
**Step 1 — Build + Verify:**
|
||||
- Build `llama.cpp` fork (or clean-room) with Metal backend on MacBook (M4 Max)
|
||||
- Run the Section 1a verification checklist against the fork's implementation before trusting any benchmarks
|
||||
- Run FP16 KV baseline: `llama-perplexity` on WikiText-2 with `qwen3.5:27b` at 8K context (this is the number we're comparing against)
|
||||
|
||||
**Step 2 â Benchmark PolarQuant:**
|
||||
**Step 2 — Benchmark PolarQuant:**
|
||||
- Run perplexity test with PolarQuant KV (turbo4 format) vs FP16 KV baseline
|
||||
- Run `llama-bench` for tok/s comparison
|
||||
- Test at 8K, 32K, and 64K context lengths
|
||||
- Run asymmetric test: K at Q8_0, V at turbo4
|
||||
- **Measure actual peak resident memory** at each context length (`footprint -p <pid>` or `vmmap --summary`). Compare measured vs calculated. If measured exceeds calculated by >15%, note the delta â it reduces the achievable context ceiling.
|
||||
- **Measure actual peak resident memory** at each context length (`footprint -p <pid>` or `vmmap --summary`). Compare measured vs calculated. If measured exceeds calculated by >15%, note the delta — it reduces the achievable context ceiling.
|
||||
- Report: PPL delta per context length, tok/s delta, **measured peak memory per context length**, max context before OOM/swap, asymmetric vs symmetric results
|
||||
|
||||
**Deliverable:** Working `llama.cpp` build on MacBook with PolarQuant KV cache. PPL + performance numbers. Section 1a verification checklist completed.
|
||||
|
||||
**Estimated Cid time (honest range):**
|
||||
- **Best case** â fork is fresh, builds clean on first try, Metal shaders work: 20-40 min
|
||||
- **Typical case** â fork needs CMake flag tweaks, Xcode SDK adjustments, minor Metal fixes: 1-2 hours
|
||||
- **Worst case** â fork is stale, conflicts extensive, or Metal shaders missing: clean-room build 2-4 hours, or MLX pivot
|
||||
- **Best case** — fork is fresh, builds clean on first try, Metal shaders work: 20-40 min
|
||||
- **Typical case** — fork needs CMake flag tweaks, Xcode SDK adjustments, minor Metal fixes: 1-2 hours
|
||||
- **Worst case** — fork is stale, conflicts extensive, or Metal shaders missing: clean-room build 2-4 hours, or MLX pivot
|
||||
|
||||
**2-hour build troubleshooting cap:** If the `llama.cpp` fork doesn't compile and pass a basic smoke test (load model, generate 10 tokens) within 2 hours of troubleshooting, stop. Pivot to MLX path. Don't sink more time into Xcode/CMake/Metal debug loops when a working MLX PoC exists. Report what broke â the information is useful even if the path is abandoned.
|
||||
**2-hour build troubleshooting cap:** If the `llama.cpp` fork doesn't compile and pass a basic smoke test (load model, generate 10 tokens) within 2 hours of troubleshooting, stop. Pivot to MLX path. Don't sink more time into Xcode/CMake/Metal debug loops when a working MLX PoC exists. Report what broke — the information is useful even if the path is abandoned.
|
||||
|
||||
**Decision gate:** If PPL delta ⤠0.5 and tok/s ⥠90% baseline AND Section 1a checklist passes â proceed to Phase 2. If PPL fails but checklist passes â try asymmetric K/V or lower compression (turbo3 instead of turbo4). If checklist fails â fix implementation before trusting benchmarks.
|
||||
**Decision gate:** If PPL delta ≤ 0.5 and tok/s ≥ 90% baseline AND Section 1a checklist passes → proceed to Phase 2. If PPL fails but checklist passes → try asymmetric K/V or lower compression (turbo3 instead of turbo4). If checklist fails → fix implementation before trusting benchmarks.
|
||||
|
||||
### Phase 2 â Ollama Integration + Production Deploy
|
||||
### Phase 2 — Ollama Integration + Production Deploy
|
||||
|
||||
**Scope:**
|
||||
|
||||
**Step 0 â Ollama API Compatibility Check (before building):**
|
||||
**Step 0 — Ollama API Compatibility Check (before building):**
|
||||
Ollama pins a specific `llama.cpp` commit and calls it through CGo bindings in `llm/`. If our fork changes any function signatures, struct layouts, or enum values that Ollama's Go code references, the build will either fail or produce subtle runtime bugs.
|
||||
|
||||
```bash
|
||||
@@ -776,7 +761,7 @@ If API surface differs: check if TurboQuant changes are additive (new functions/
|
||||
|
||||
**Estimated Cid time:** 15-25 min (Ollama build is straightforward once `llama.cpp` fork is validated).
|
||||
|
||||
### Phase 2.5 â Per-Layer Quantization Profiles (Optimization, Optional)
|
||||
### Phase 2.5 — Per-Layer Quantization Profiles (Optimization, Optional)
|
||||
|
||||
Not all transformer layers have equal sensitivity to KV cache quantization. Research and community experimentation show early layers (first 2-4) and late layers (last 2-4) tend to be more sensitive than middle layers. If the fork supports per-layer KV cache type configuration:
|
||||
|
||||
@@ -789,19 +774,19 @@ This gives the same average compression ratio as uniform turbo4 but concentrates
|
||||
|
||||
**Cid note:** During Phase 1, check whether the fork exposes per-layer KV type config. If it does, note it for later. Don't implement it yet.
|
||||
|
||||
### Phase 3 â QJL Residual Correction (Optional)
|
||||
### Phase 3 — QJL Residual Correction (Optional)
|
||||
|
||||
**Scope:** Add QJL 1-bit residual correction for full TurboQuant behavior. Only pursue if:
|
||||
- Phase 1/2 PolarQuant shows quality gaps at extreme compression (< 3 bits/channel)
|
||||
- We want to push to 2.5 bits/channel for even more context headroom
|
||||
|
||||
**Source:** `amirzandieh/QJL` repo (CUDA â Metal port needed)
|
||||
**Source:** `amirzandieh/QJL` repo (CUDA → Metal port needed)
|
||||
|
||||
**Estimated Cid time:** 30-60 min (Metal port of QJL kernels is real engineering work)
|
||||
|
||||
**Decision gate:** Only proceed if PolarQuant alone doesn't meet quality bar at target compression.
|
||||
|
||||
### Phase 4 â Upstream Watch
|
||||
### Phase 4 — Upstream Watch
|
||||
|
||||
**Scope:** Monitor `llama.cpp` upstream and Ollama for official TurboQuant support. When it lands:
|
||||
- Evaluate upstream implementation vs our fork
|
||||
@@ -814,10 +799,10 @@ This gives the same average compression ratio as uniform turbo4 but concentrates
|
||||
|
||||
## What This Spec Does NOT Cover
|
||||
|
||||
- **Weight quantization** â TurboQuant is KV cache compression only. Model weight quantization (GGUF Q4_K_M etc.) is a separate concern and already handled by Ollama.
|
||||
- **Predator (desktop) deployment** â this spec targets MacBook only. Predator runs NVIDIA (CUDA) which is a different kernel backend. Can extend later.
|
||||
- **Multi-model serving** â TurboQuant helps with single-model memory but doesn't change Ollama's single-model-at-a-time constraint.
|
||||
- **Ollama upstream contribution** â out of scope for now. We build for ourselves first.
|
||||
- **Weight quantization** — TurboQuant is KV cache compression only. Model weight quantization (GGUF Q4_K_M etc.) is a separate concern and already handled by Ollama.
|
||||
- **Predator (desktop) deployment** — this spec targets MacBook only. Predator runs NVIDIA (CUDA) which is a different kernel backend. Can extend later.
|
||||
- **Multi-model serving** — TurboQuant helps with single-model memory but doesn't change Ollama's single-model-at-a-time constraint.
|
||||
- **Ollama upstream contribution** — out of scope for now. We build for ourselves first.
|
||||
|
||||
---
|
||||
|
||||
@@ -825,7 +810,7 @@ This gives the same average compression ratio as uniform turbo4 but concentrates
|
||||
|
||||
**None blocking.** One informational:
|
||||
|
||||
- **MacBook Pro memory:** Confirmed M4 Max 32GB from memory/2026-03-14.md. If it's actually 36GB or 48GB (M4 Max comes in 36/48/128 configs), that changes the model ceiling. Can Cid check `sysctl hw.memsize` on the MacBook during Phase 1? Non-blocking â doesn't change the approach, just the model size ceiling.
|
||||
- **MacBook Pro memory:** Confirmed M4 Max 32GB from memory/2026-03-14.md. If it's actually 36GB or 48GB (M4 Max comes in 36/48/128 configs), that changes the model ceiling. Can Cid check `sysctl hw.memsize` on the MacBook during Phase 1? Non-blocking — doesn't change the approach, just the model size ceiling.
|
||||
|
||||
---
|
||||
|
||||
@@ -850,8 +835,8 @@ This gives the same average compression ratio as uniform turbo4 but concentrates
|
||||
|
||||
- **v1 (2026-03-30 12:26 ET):** Initial spec.
|
||||
- **v2 (2026-03-30 12:55 ET):** Added Section 1a (PolarQuant technical detail + Cid verification checklist), expanded fork risk assessment with mitigation plan, added Phase 1 Step 0 (fork assessment before benchmarking), added long-session quality test for Phase 2, updated Phase 1 time estimate for clean-room path. Changes driven by external Opus review round 1.
|
||||
- **v2.1 (2026-03-30 13:00 ET):** Added Metal kernel risk check (grep before build â determines llama.cpp vs MLX primary path), corrected memory budget (27GB available, not 30GB â accounts for OS + Metal driver + activations), added measured memory profiling requirement to Phase 1, added Ollama CGo API compatibility check to Phase 2 Step 0, tightened model ceiling estimates. Changes driven by external Opus review round 2.
|
||||
- **v2.2 (2026-03-30 13:05 ET):** Added honest time estimate range (20 min best â 2-4 hr worst), 2-hour build troubleshooting cap before MLX pivot, PolarQuant initialization detail (WHT + Lloyd-Max codebook setup + cold-start measurement target), 10 predefined test prompts with rationale (prevents cherry-picking), per-layer quantization profiles as Phase 2.5 optimization path. Changes driven by external Opus review round 3.
|
||||
- **v2.1 (2026-03-30 13:00 ET):** Added Metal kernel risk check (grep before build — determines llama.cpp vs MLX primary path), corrected memory budget (27GB available, not 30GB — accounts for OS + Metal driver + activations), added measured memory profiling requirement to Phase 1, added Ollama CGo API compatibility check to Phase 2 Step 0, tightened model ceiling estimates. Changes driven by external Opus review round 2.
|
||||
- **v2.2 (2026-03-30 13:05 ET):** Added honest time estimate range (20 min best → 2-4 hr worst), 2-hour build troubleshooting cap before MLX pivot, PolarQuant initialization detail (WHT + Lloyd-Max codebook setup + cold-start measurement target), 10 predefined test prompts with rationale (prevents cherry-picking), per-layer quantization profiles as Phase 2.5 optimization path. Changes driven by external Opus review round 3.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,60 +0,0 @@
|
||||
# TurboQuant Living Status Tracker
|
||||
|
||||
Updated on each milestone. See PROJECT_STATUS.md for detailed phase reports.
|
||||
|
||||
## Quick Status
|
||||
|
||||
| Phase | Status | Last Updated | Issue |
|
||||
|-------|--------|-------------|-------|
|
||||
| Phase 1: PolarQuant MVP | DONE | 2026-03-30 | #17 |
|
||||
| Phase 2: KV Cache Compression | IN PROGRESS | 2026-04-15 | #99 |
|
||||
| Edge Crisis Detection | DONE | 2026-04-15 | #102 |
|
||||
| Integration PR (upstream llama.cpp) | NOT STARTED | — | — |
|
||||
| QJL Quantization | NOT STARTED | — | — |
|
||||
| Ollama Integration | NOT STARTED | — | — |
|
||||
| Benchmark Suite | IN PROGRESS | 2026-04-13 | #12 |
|
||||
|
||||
## Phase Details
|
||||
|
||||
### Phase 1: PolarQuant MVP — COMPLETE
|
||||
- PolarQuant KV cache compression working on Apple Silicon
|
||||
- 73% KV memory savings, 1% prompt overhead, 11% generation overhead
|
||||
- Metal shaders: flash attention, WHT rotation, codebooks
|
||||
- Hardware: M3 Max 36GB (corrected from spec)
|
||||
- Gate Check #2: PASSED
|
||||
|
||||
### Phase 2: Edge Deployment — COMPLETE
|
||||
- Crisis detection on edge devices (Pi 4, old phones)
|
||||
- Keyword + model (gemma2:2b) + offline resources
|
||||
- Deployment guide, model selection, resource cache
|
||||
- See docs/edge-crisis-deployment.md
|
||||
|
||||
### Phase 3: Upstream Integration — NOT STARTED
|
||||
- PR to llama.cpp for turbo quantization
|
||||
- Depends on Phase 2 benchmarks
|
||||
|
||||
### Phase 4: QJL — NOT STARTED
|
||||
- Johnson-Lindenstrauss quantization
|
||||
- Lower memory than PolarQuant
|
||||
- Research phase
|
||||
|
||||
## Recent Milestones
|
||||
|
||||
| Date | Milestone | PR/Issue |
|
||||
|------|-----------|----------|
|
||||
| 2026-04-15 | Edge crisis detection deployed | #102 / PR #111 |
|
||||
| 2026-04-14 | KV cache compression profiles | PR #68 |
|
||||
| 2026-04-13 | Benchmark suite expanded | #12 / #39 |
|
||||
| 2026-03-30 | Phase 1 complete: PolarQuant MVP | #17 |
|
||||
|
||||
## Open Blockers
|
||||
|
||||
| Blocker | Impact | Issue |
|
||||
|---------|--------|-------|
|
||||
| None currently | — | — |
|
||||
|
||||
---
|
||||
|
||||
*Last auto-updated: 2026-04-15*
|
||||
*This file is the single source of truth for project status.*
|
||||
*Update it on every milestone merge.*
|
||||
548
evolution/quant_selector.py
Normal file
548
evolution/quant_selector.py
Normal file
@@ -0,0 +1,548 @@
|
||||
"""Auto-select TurboQuant compression level based on available VRAM/RAM.
|
||||
|
||||
Detects hardware resources at startup and picks the highest quality
|
||||
quantization level that fits within available memory. Supports Apple
|
||||
Silicon unified memory, NVIDIA GPUs (via nvidia-smi), and CPU-only fallback.
|
||||
|
||||
Usage:
|
||||
from evolution.quant_selector import select_quant_level
|
||||
|
||||
selection = select_quant_level(model_size_gb=14.0, context_length=32768)
|
||||
print(selection.level) # "turbo4"
|
||||
print(selection.reasoning) # "M4 Max 36GB unified: turbo4 fits 14.0GB model + ..."
|
||||
print(selection.env_vars) # {"TURBO_LAYER_ADAPTIVE": "7"}
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── Quant Level Definitions ───────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class QuantLevel:
|
||||
"""A TurboQuant compression level with its memory characteristics."""
|
||||
name: str # e.g. "turbo4"
|
||||
bits_per_channel: float # e.g. 3.5 for turbo4
|
||||
compression_ratio: float # vs uncompressed KV cache
|
||||
quality_label: str # "best", "high", "balanced", "fast"
|
||||
layer_adaptive: int # TURBO_LAYER_ADAPTIVE value (0-7)
|
||||
kv_type: str # -ctk/-ctv flag value
|
||||
min_memory_headroom_gb: float # Minimum free memory to recommend this level
|
||||
description: str = ""
|
||||
|
||||
|
||||
# Ordered from highest quality to most aggressive compression
|
||||
QUANT_LEVELS = [
|
||||
QuantLevel(
|
||||
name="turbo4",
|
||||
bits_per_channel=3.5,
|
||||
compression_ratio=4.2,
|
||||
quality_label="best",
|
||||
layer_adaptive=7,
|
||||
kv_type="turbo4",
|
||||
min_memory_headroom_gb=4.0,
|
||||
description="PolarQuant + QJL 4-bit. Best quality, ~4.2x KV compression."
|
||||
),
|
||||
QuantLevel(
|
||||
name="turbo3",
|
||||
bits_per_channel=2.5,
|
||||
compression_ratio=6.0,
|
||||
quality_label="high",
|
||||
layer_adaptive=5,
|
||||
kv_type="turbo3",
|
||||
min_memory_headroom_gb=3.0,
|
||||
description="3-bit TurboQuant. High quality, ~6x KV compression."
|
||||
),
|
||||
QuantLevel(
|
||||
name="turbo2",
|
||||
bits_per_channel=1.5,
|
||||
compression_ratio=10.0,
|
||||
quality_label="balanced",
|
||||
layer_adaptive=3,
|
||||
kv_type="turbo2",
|
||||
min_memory_headroom_gb=2.0,
|
||||
description="2-bit TurboQuant. Balanced, ~10x KV compression."
|
||||
),
|
||||
QuantLevel(
|
||||
name="q4_0",
|
||||
bits_per_channel=4.0,
|
||||
compression_ratio=3.5,
|
||||
quality_label="fast",
|
||||
layer_adaptive=0,
|
||||
kv_type="q4_0",
|
||||
min_memory_headroom_gb=1.5,
|
||||
description="Standard 4-bit quant. Fast fallback, no TurboQuant."
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
# ── Hardware Detection ────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class HardwareInfo:
|
||||
"""Detected hardware resources."""
|
||||
total_memory_gb: float
|
||||
available_memory_gb: float
|
||||
gpu_memory_gb: Optional[float] = None
|
||||
gpu_name: Optional[str] = None
|
||||
is_apple_silicon: bool = False
|
||||
chip_name: Optional[str] = None
|
||||
cpu_cores: int = 0
|
||||
detection_method: str = ""
|
||||
|
||||
|
||||
def detect_hardware() -> HardwareInfo:
|
||||
"""Detect available memory and GPU resources."""
|
||||
system = platform.system()
|
||||
|
||||
if system == "Darwin":
|
||||
return _detect_apple_silicon()
|
||||
elif system == "Linux":
|
||||
return _detect_linux()
|
||||
else:
|
||||
return _detect_generic(system)
|
||||
|
||||
|
||||
def _detect_apple_silicon() -> HardwareInfo:
|
||||
"""Detect Apple Silicon unified memory."""
|
||||
info = HardwareInfo(
|
||||
total_memory_gb=0,
|
||||
available_memory_gb=0,
|
||||
is_apple_silicon=True,
|
||||
detection_method="sysctl",
|
||||
)
|
||||
|
||||
try:
|
||||
# Get total memory
|
||||
result = subprocess.run(
|
||||
["sysctl", "-n", "hw.memsize"],
|
||||
capture_output=True, text=True, timeout=5
|
||||
)
|
||||
if result.returncode == 0:
|
||||
info.total_memory_gb = int(result.stdout.strip()) / (1024**3)
|
||||
|
||||
# Get chip name
|
||||
result = subprocess.run(
|
||||
["sysctl", "-n", "machdep.cpu.brand_string"],
|
||||
capture_output=True, text=True, timeout=5
|
||||
)
|
||||
if result.returncode == 0:
|
||||
info.chip_name = result.stdout.strip()
|
||||
|
||||
# Try to get GPU name (Apple Silicon)
|
||||
result = subprocess.run(
|
||||
["system_profiler", "SPDisplaysDataType"],
|
||||
capture_output=True, text=True, timeout=10
|
||||
)
|
||||
if result.returncode == 0:
|
||||
for line in result.stdout.split("\n"):
|
||||
if "Chipset" in line or "GPU" in line:
|
||||
info.gpu_name = line.split(":")[-1].strip()
|
||||
break
|
||||
|
||||
# Estimate available memory (vm_stat)
|
||||
result = subprocess.run(
|
||||
["vm_stat"],
|
||||
capture_output=True, text=True, timeout=5
|
||||
)
|
||||
if result.returncode == 0:
|
||||
page_size = 4096 # macOS default
|
||||
free_pages = 0
|
||||
for line in result.stdout.split("\n"):
|
||||
if "Pages free:" in line:
|
||||
try:
|
||||
free_pages = int(line.split(":")[-1].strip().rstrip("."))
|
||||
except ValueError:
|
||||
pass
|
||||
# Available ≈ free + some speculative (conservative: just free)
|
||||
info.available_memory_gb = (free_pages * page_size) / (1024**3)
|
||||
|
||||
# Fallback if vm_stat parsing failed
|
||||
if info.available_memory_gb < 1:
|
||||
# Conservative: 70% of total
|
||||
info.available_memory_gb = info.total_memory_gb * 0.70
|
||||
|
||||
# Apple Silicon shares memory — GPU memory = total memory
|
||||
info.gpu_memory_gb = info.total_memory_gb
|
||||
|
||||
# Detect CPU cores
|
||||
result = subprocess.run(
|
||||
["sysctl", "-n", "hw.ncpu"],
|
||||
capture_output=True, text=True, timeout=5
|
||||
)
|
||||
if result.returncode == 0:
|
||||
info.cpu_cores = int(result.stdout.strip())
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Apple Silicon detection failed: {e}")
|
||||
# Fallback
|
||||
info.total_memory_gb = 16.0
|
||||
info.available_memory_gb = 12.0
|
||||
info.detection_method = "fallback"
|
||||
|
||||
return info
|
||||
|
||||
|
||||
def _detect_linux() -> HardwareInfo:
|
||||
"""Detect Linux system with optional NVIDIA GPU."""
|
||||
info = HardwareInfo(
|
||||
total_memory_gb=0,
|
||||
available_memory_gb=0,
|
||||
detection_method="proc",
|
||||
)
|
||||
|
||||
try:
|
||||
# Read /proc/meminfo
|
||||
with open("/proc/meminfo", "r") as f:
|
||||
meminfo = f.read()
|
||||
|
||||
for line in meminfo.split("\n"):
|
||||
if line.startswith("MemTotal:"):
|
||||
kb = int(line.split()[1])
|
||||
info.total_memory_gb = kb / (1024 * 1024)
|
||||
elif line.startswith("MemAvailable:"):
|
||||
kb = int(line.split()[1])
|
||||
info.available_memory_gb = kb / (1024 * 1024)
|
||||
|
||||
# CPU cores
|
||||
info.cpu_cores = os.cpu_count() or 1
|
||||
|
||||
# Check for NVIDIA GPU
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
|
||||
"--format=csv,noheader,nounits"],
|
||||
capture_output=True, text=True, timeout=10
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
lines = result.stdout.strip().split("\n")
|
||||
if lines:
|
||||
parts = lines[0].split(", ")
|
||||
if len(parts) >= 3:
|
||||
info.gpu_name = parts[0].strip()
|
||||
info.gpu_memory_gb = float(parts[1]) / 1024 # MB to GB
|
||||
gpu_free = float(parts[2]) / 1024
|
||||
# Use GPU free for VRAM-based selection
|
||||
info.available_memory_gb = max(info.available_memory_gb, gpu_free)
|
||||
info.detection_method = "nvidia-smi"
|
||||
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||
pass # No NVIDIA GPU
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Linux detection failed: {e}")
|
||||
info.total_memory_gb = 16.0
|
||||
info.available_memory_gb = 12.0
|
||||
info.detection_method = "fallback"
|
||||
|
||||
return info
|
||||
|
||||
|
||||
def _detect_generic(system: str) -> HardwareInfo:
|
||||
"""Fallback detection for unknown systems."""
|
||||
import psutil
|
||||
mem = psutil.virtual_memory()
|
||||
return HardwareInfo(
|
||||
total_memory_gb=mem.total / (1024**3),
|
||||
available_memory_gb=mem.available / (1024**3),
|
||||
cpu_cores=os.cpu_count() or 1,
|
||||
detection_method="psutil",
|
||||
)
|
||||
|
||||
|
||||
# ── KV Cache Memory Estimation ───────────────────────────────────────────────
|
||||
|
||||
def estimate_kv_cache_gb(
|
||||
context_length: int,
|
||||
num_layers: int = 48,
|
||||
num_kv_heads: int = 8,
|
||||
head_dim: int = 128,
|
||||
bits_per_channel: float = 3.5,
|
||||
) -> float:
|
||||
"""Estimate KV cache memory for given parameters.
|
||||
|
||||
Formula: 2 (K+V) × layers × kv_heads × head_dim × context_length × bits/8
|
||||
"""
|
||||
bytes_per_element = bits_per_channel / 8.0
|
||||
total_bytes = 2 * num_layers * num_kv_heads * head_dim * context_length * bytes_per_element
|
||||
return total_bytes / (1024**3)
|
||||
|
||||
|
||||
def estimate_model_memory_gb(model_size_gb: float, quant_type: str = "q4_k_m") -> float:
|
||||
"""Estimate model weights memory. Returns loaded size in GB.
|
||||
|
||||
This is a rough estimate — actual depends on exact quant format.
|
||||
"""
|
||||
# Common quant ratios (vs fp16)
|
||||
quant_multipliers = {
|
||||
"f16": 1.0,
|
||||
"q8_0": 0.5,
|
||||
"q6_k": 0.42,
|
||||
"q5_k_m": 0.37,
|
||||
"q4_k_m": 0.32,
|
||||
"q3_k_m": 0.27,
|
||||
"q2_k": 0.22,
|
||||
}
|
||||
# model_size_gb is already quantized size
|
||||
return model_size_gb
|
||||
|
||||
|
||||
# ── Selection Logic ───────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class QuantSelection:
|
||||
"""Result of quantization level selection."""
|
||||
level: QuantLevel
|
||||
hardware: HardwareInfo
|
||||
reasoning: str
|
||||
total_required_gb: float
|
||||
available_gb: float
|
||||
headroom_gb: float
|
||||
env_vars: dict = field(default_factory=dict)
|
||||
server_flags: dict = field(default_factory=dict)
|
||||
warnings: list = field(default_factory=list)
|
||||
|
||||
|
||||
def select_quant_level(
|
||||
model_size_gb: float = 14.0,
|
||||
context_length: int = 32768,
|
||||
num_layers: int = 48,
|
||||
num_kv_heads: int = 8,
|
||||
head_dim: int = 128,
|
||||
preferred_level: Optional[str] = None,
|
||||
force_cpu: bool = False,
|
||||
) -> QuantSelection:
|
||||
"""Select the best quantization level for available hardware.
|
||||
|
||||
Args:
|
||||
model_size_gb: Size of the model weights in GB
|
||||
context_length: Target context length
|
||||
num_layers: Number of transformer layers
|
||||
num_kv_heads: Number of KV attention heads
|
||||
head_dim: Dimension per attention head
|
||||
preferred_level: Force a specific level (still checks if it fits)
|
||||
force_cpu: If True, ignore GPU memory
|
||||
|
||||
Returns:
|
||||
QuantSelection with the chosen level and reasoning
|
||||
"""
|
||||
hw = detect_hardware()
|
||||
|
||||
if force_cpu:
|
||||
hw.gpu_memory_gb = None
|
||||
hw.gpu_name = None
|
||||
|
||||
# Use the most restrictive memory constraint
|
||||
# For Apple Silicon: unified memory, use total
|
||||
# For NVIDIA: use GPU VRAM
|
||||
# For CPU-only: use system RAM
|
||||
if hw.gpu_memory_gb and hw.gpu_name:
|
||||
memory_pool_gb = hw.gpu_memory_gb
|
||||
memory_label = f"{hw.gpu_name} {hw.gpu_memory_gb:.0f}GB VRAM"
|
||||
elif hw.is_apple_silicon:
|
||||
memory_pool_gb = hw.total_memory_gb
|
||||
memory_label = f"{hw.chip_name or 'Apple Silicon'} {hw.total_memory_gb:.0f}GB unified"
|
||||
else:
|
||||
memory_pool_gb = hw.total_memory_gb
|
||||
memory_label = f"{hw.cpu_cores}c CPU {hw.total_memory_gb:.0f}GB RAM"
|
||||
|
||||
model_mem = estimate_model_memory_gb(model_size_gb)
|
||||
|
||||
# Try levels from best to most compressed
|
||||
chosen = None
|
||||
for level in QUANT_LEVELS:
|
||||
if preferred_level and level.name != preferred_level:
|
||||
continue
|
||||
|
||||
kv_mem = estimate_kv_cache_gb(
|
||||
context_length, num_layers, num_kv_heads, head_dim,
|
||||
level.bits_per_channel
|
||||
)
|
||||
total_required = model_mem + kv_mem
|
||||
headroom = memory_pool_gb - total_required
|
||||
|
||||
if headroom >= level.min_memory_headroom_gb:
|
||||
chosen = level
|
||||
break
|
||||
|
||||
if preferred_level and level.name == preferred_level:
|
||||
# User forced this level but it doesn't fit
|
||||
chosen = level
|
||||
break
|
||||
|
||||
if chosen is None:
|
||||
# Nothing fits — pick the most aggressive compression, not the q4_0 fallback.
|
||||
chosen = max(QUANT_LEVELS, key=lambda level: level.compression_ratio)
|
||||
logger.warning(f"No quant level fits in {memory_pool_gb:.1f}GB. Using {chosen.name}.")
|
||||
|
||||
# Calculate final numbers
|
||||
kv_mem = estimate_kv_cache_gb(
|
||||
context_length, num_layers, num_kv_heads, head_dim,
|
||||
chosen.bits_per_channel
|
||||
)
|
||||
total_required = model_mem + kv_mem
|
||||
headroom = memory_pool_gb - total_required
|
||||
|
||||
# Build reasoning
|
||||
reasoning_parts = [
|
||||
f"{memory_label}:",
|
||||
f"{chosen.name} ({chosen.quality_label}, {chosen.bits_per_channel:.1f}b/ch,",
|
||||
f"{chosen.compression_ratio:.1f}x compression)",
|
||||
f"fits {model_mem:.1f}GB model + {kv_mem:.1f}GB KV cache",
|
||||
f"@ {context_length}K context = {total_required:.1f}GB / {memory_pool_gb:.0f}GB",
|
||||
f"({headroom:.1f}GB headroom)"
|
||||
]
|
||||
reasoning = " ".join(reasoning_parts)
|
||||
|
||||
# Build environment variables for llama.cpp
|
||||
env_vars = {
|
||||
"TURBO_LAYER_ADAPTIVE": str(chosen.layer_adaptive),
|
||||
}
|
||||
|
||||
# Build server flags
|
||||
server_flags = {
|
||||
"-ctk": chosen.kv_type,
|
||||
"-ctv": chosen.kv_type,
|
||||
"-c": str(context_length),
|
||||
}
|
||||
|
||||
# Warnings
|
||||
warnings = []
|
||||
if headroom < 2.0:
|
||||
warnings.append(
|
||||
f"Low headroom ({headroom:.1f}GB). Consider reducing context length or model size."
|
||||
)
|
||||
if headroom < 0:
|
||||
warnings.append(
|
||||
f"OVERCOMMITTED: needs {total_required:.1f}GB but only {memory_pool_gb:.0f}GB available. "
|
||||
f"Inference may fail or swap heavily."
|
||||
)
|
||||
|
||||
selection = QuantSelection(
|
||||
level=chosen,
|
||||
hardware=hw,
|
||||
reasoning=reasoning,
|
||||
total_required_gb=total_required,
|
||||
available_gb=memory_pool_gb,
|
||||
headroom_gb=headroom,
|
||||
env_vars=env_vars,
|
||||
server_flags=server_flags,
|
||||
warnings=warnings,
|
||||
)
|
||||
|
||||
logger.info(f"Quant selection: {reasoning}")
|
||||
for w in warnings:
|
||||
logger.warning(w)
|
||||
|
||||
return selection
|
||||
|
||||
|
||||
# ── CLI ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
"""CLI entry point for quant level selection."""
|
||||
import argparse
|
||||
import json
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Auto-select TurboQuant compression level based on available hardware"
|
||||
)
|
||||
parser.add_argument("--model-size", type=float, default=14.0,
|
||||
help="Model size in GB (default: 14.0)")
|
||||
parser.add_argument("--context", type=int, default=32768,
|
||||
help="Target context length (default: 32768)")
|
||||
parser.add_argument("--layers", type=int, default=48,
|
||||
help="Number of transformer layers (default: 48)")
|
||||
parser.add_argument("--kv-heads", type=int, default=8,
|
||||
help="Number of KV attention heads (default: 8)")
|
||||
parser.add_argument("--head-dim", type=int, default=128,
|
||||
help="Dimension per attention head (default: 128)")
|
||||
parser.add_argument("--prefer", type=str, default=None,
|
||||
choices=[l.name for l in QUANT_LEVELS],
|
||||
help="Prefer a specific quant level")
|
||||
parser.add_argument("--force-cpu", action="store_true",
|
||||
help="Ignore GPU, use CPU memory only")
|
||||
parser.add_argument("--json", action="store_true",
|
||||
help="JSON output for automation")
|
||||
parser.add_argument("--detect-only", action="store_true",
|
||||
help="Only detect hardware, don't select")
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
||||
|
||||
if args.detect_only:
|
||||
hw = detect_hardware()
|
||||
if args.json:
|
||||
print(json.dumps(hw.__dict__, default=str, indent=2))
|
||||
else:
|
||||
print(f"Total memory: {hw.total_memory_gb:.1f} GB")
|
||||
print(f"Available: {hw.available_memory_gb:.1f} GB")
|
||||
if hw.gpu_memory_gb:
|
||||
print(f"GPU memory: {hw.gpu_memory_gb:.1f} GB")
|
||||
if hw.gpu_name:
|
||||
print(f"GPU: {hw.gpu_name}")
|
||||
if hw.is_apple_silicon:
|
||||
print(f"Chip: {hw.chip_name or 'Apple Silicon'}")
|
||||
print(f"CPU cores: {hw.cpu_cores}")
|
||||
print(f"Detection: {hw.detection_method}")
|
||||
return
|
||||
|
||||
selection = select_quant_level(
|
||||
model_size_gb=args.model_size,
|
||||
context_length=args.context,
|
||||
num_layers=args.layers,
|
||||
num_kv_heads=args.kv_heads,
|
||||
head_dim=args.head_dim,
|
||||
preferred_level=args.prefer,
|
||||
force_cpu=args.force_cpu,
|
||||
)
|
||||
|
||||
if args.json:
|
||||
result = {
|
||||
"level": selection.level.name,
|
||||
"bits_per_channel": selection.level.bits_per_channel,
|
||||
"compression_ratio": selection.level.compression_ratio,
|
||||
"quality": selection.level.quality_label,
|
||||
"reasoning": selection.reasoning,
|
||||
"total_required_gb": round(selection.total_required_gb, 2),
|
||||
"available_gb": round(selection.available_gb, 1),
|
||||
"headroom_gb": round(selection.headroom_gb, 2),
|
||||
"env_vars": selection.env_vars,
|
||||
"server_flags": selection.server_flags,
|
||||
"warnings": selection.warnings,
|
||||
"hardware": {
|
||||
"total_memory_gb": round(selection.hardware.total_memory_gb, 1),
|
||||
"gpu_name": selection.hardware.gpu_name,
|
||||
"is_apple_silicon": selection.hardware.is_apple_silicon,
|
||||
"chip_name": selection.hardware.chip_name,
|
||||
"cpu_cores": selection.hardware.cpu_cores,
|
||||
},
|
||||
}
|
||||
print(json.dumps(result, indent=2))
|
||||
else:
|
||||
print(f"Selected: {selection.level.name} ({selection.level.quality_label})")
|
||||
print(f" {selection.reasoning}")
|
||||
print()
|
||||
print(f"Environment variables:")
|
||||
for k, v in selection.env_vars.items():
|
||||
print(f" export {k}={v}")
|
||||
print()
|
||||
print(f"Server flags:")
|
||||
for k, v in selection.server_flags.items():
|
||||
print(f" {k} {v}")
|
||||
if selection.warnings:
|
||||
print()
|
||||
for w in selection.warnings:
|
||||
print(f" WARNING: {w}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
3
tests/conftest.py
Normal file
3
tests/conftest.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""Pytest configuration for turboquant."""
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
58
tests/test_dflash_apple_silicon.py
Normal file
58
tests/test_dflash_apple_silicon.py
Normal file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for Apple Silicon DFlash benchmark planning helpers (issue #152)."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from unittest.mock import patch
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
|
||||
from benchmarks.dflash_apple_silicon import ( # noqa: E402
|
||||
build_mlx_benchmark_command,
|
||||
detect_total_memory_gb,
|
||||
render_report_template,
|
||||
select_pair,
|
||||
)
|
||||
|
||||
|
||||
class TestPairSelection:
|
||||
def test_prefers_qwen35_9b_on_36gb_mac(self):
|
||||
pair = select_pair(total_memory_gb=36)
|
||||
assert pair.slug == "qwen35-9b"
|
||||
assert pair.base_model == "Qwen/Qwen3.5-9B"
|
||||
assert pair.draft_model == "z-lab/Qwen3.5-9B-DFlash"
|
||||
|
||||
def test_falls_back_to_4b_when_memory_is_tight(self):
|
||||
pair = select_pair(total_memory_gb=20)
|
||||
assert pair.slug == "qwen35-4b"
|
||||
assert pair.base_model == "Qwen/Qwen3.5-4B"
|
||||
|
||||
|
||||
class TestCommandGeneration:
|
||||
def test_builds_upstream_mlx_benchmark_command(self):
|
||||
pair = select_pair(total_memory_gb=36)
|
||||
command = build_mlx_benchmark_command(pair, dataset="gsm8k", max_samples=64)
|
||||
assert "python -m dflash.benchmark --backend mlx" in command
|
||||
assert "--model Qwen/Qwen3.5-9B" in command
|
||||
assert "--draft-model z-lab/Qwen3.5-9B-DFlash" in command
|
||||
assert "--dataset gsm8k" in command
|
||||
assert "--max-samples 64" in command
|
||||
assert "--draft-sliding-window-size 4096" in command
|
||||
|
||||
|
||||
class TestReportTemplate:
|
||||
def test_report_template_mentions_baseline_and_verdict(self):
|
||||
pair = select_pair(total_memory_gb=36)
|
||||
report = render_report_template(machine_label="M3 Max 36GB", pair=pair)
|
||||
assert "DFlash Apple Silicon Benchmark Report" in report
|
||||
assert "M3 Max 36GB" in report
|
||||
assert "Qwen/Qwen3.5-9B" in report
|
||||
assert "plain MLX or llama.cpp speculative decoding" in report
|
||||
assert "Worth operationalizing locally?" in report
|
||||
|
||||
|
||||
class TestMemoryDetection:
|
||||
@patch("benchmarks.dflash_apple_silicon.platform.system", return_value="Darwin")
|
||||
@patch("benchmarks.dflash_apple_silicon.subprocess.check_output", return_value=b"38654705664\n")
|
||||
def test_detect_total_memory_gb_on_macos(self, _mock_sysctl, _mock_system):
|
||||
assert detect_total_memory_gb() == 36.0
|
||||
177
tests/test_quant_selector.py
Normal file
177
tests/test_quant_selector.py
Normal file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for quant_selector.py"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
from evolution.quant_selector import (
|
||||
QuantLevel,
|
||||
HardwareInfo,
|
||||
QUANT_LEVELS,
|
||||
detect_hardware,
|
||||
estimate_kv_cache_gb,
|
||||
estimate_model_memory_gb,
|
||||
select_quant_level,
|
||||
)
|
||||
|
||||
|
||||
class TestQuantLevels:
|
||||
def test_levels_keep_turboquant_quality_order_with_q4_fallback_last(self):
|
||||
"""TurboQuant levels should lead, with q4_0 reserved as the non-Turbo fallback."""
|
||||
names = [level.name for level in QUANT_LEVELS]
|
||||
assert names[:3] == ["turbo4", "turbo3", "turbo2"]
|
||||
assert names[-1] == "q4_0"
|
||||
|
||||
def test_all_levels_have_required_fields(self):
|
||||
for level in QUANT_LEVELS:
|
||||
assert level.name
|
||||
assert level.bits_per_channel > 0
|
||||
assert level.compression_ratio > 1
|
||||
assert level.quality_label
|
||||
assert level.layer_adaptive >= 0
|
||||
assert level.kv_type
|
||||
|
||||
|
||||
class TestKVEstimate:
|
||||
def test_basic_estimate(self):
|
||||
# 48 layers, 8 heads, 128 dim, 32K context, 3.5 bits
|
||||
kv_gb = estimate_kv_cache_gb(32768, 48, 8, 128, 3.5)
|
||||
assert kv_gb > 0
|
||||
assert kv_gb < 10 # Should be reasonable
|
||||
|
||||
def test_longer_context_larger(self):
|
||||
kv_32k = estimate_kv_cache_gb(32768, 48, 8, 128, 3.5)
|
||||
kv_128k = estimate_kv_cache_gb(131072, 48, 8, 128, 3.5)
|
||||
assert kv_128k > kv_32k
|
||||
|
||||
def test_higher_bits_larger(self):
|
||||
kv_4b = estimate_kv_cache_gb(32768, 48, 8, 128, 4.0)
|
||||
kv_2b = estimate_kv_cache_gb(32768, 48, 8, 128, 2.0)
|
||||
assert kv_4b > kv_2b
|
||||
|
||||
|
||||
class TestHardwareDetection:
|
||||
def test_detect_returns_info(self):
|
||||
hw = detect_hardware()
|
||||
assert hw.total_memory_gb > 0
|
||||
assert hw.available_memory_gb > 0
|
||||
assert hw.detection_method
|
||||
|
||||
@patch("evolution.quant_selector.platform.system", return_value="Linux")
|
||||
@patch("builtins.open", create=True)
|
||||
def test_linux_detection(self, mock_open, mock_system):
|
||||
mock_open.return_value.__enter__().read.return_value = (
|
||||
"MemTotal: 32000000 kB\n"
|
||||
"MemAvailable: 24000000 kB\n"
|
||||
)
|
||||
hw = _detect_linux_fallback()
|
||||
assert hw.total_memory_gb > 20
|
||||
|
||||
|
||||
def _detect_linux_fallback():
|
||||
"""Helper to test Linux detection with mocked /proc/meminfo."""
|
||||
from evolution.quant_selector import _detect_linux
|
||||
return _detect_linux()
|
||||
|
||||
|
||||
class TestSelection:
|
||||
def test_selects_turbo4_for_large_memory(self):
|
||||
"""With plenty of memory, should pick turbo4 (best quality)."""
|
||||
with patch("evolution.quant_selector.detect_hardware") as mock_hw:
|
||||
mock_hw.return_value = HardwareInfo(
|
||||
total_memory_gb=64,
|
||||
available_memory_gb=48,
|
||||
gpu_memory_gb=64,
|
||||
gpu_name="Test GPU",
|
||||
cpu_cores=16,
|
||||
detection_method="mock",
|
||||
)
|
||||
sel = select_quant_level(model_size_gb=14.0, context_length=32768)
|
||||
assert sel.level.name == "turbo4"
|
||||
assert sel.headroom_gb > 0
|
||||
|
||||
def test_selects_smaller_for_tight_memory(self):
|
||||
"""With tight memory, should pick a smaller quant."""
|
||||
with patch("evolution.quant_selector.detect_hardware") as mock_hw:
|
||||
mock_hw.return_value = HardwareInfo(
|
||||
total_memory_gb=16,
|
||||
available_memory_gb=12,
|
||||
gpu_memory_gb=16,
|
||||
gpu_name="Test GPU",
|
||||
cpu_cores=8,
|
||||
detection_method="mock",
|
||||
)
|
||||
sel = select_quant_level(model_size_gb=14.0, context_length=131072)
|
||||
# Should pick a smaller quant for 128K context on 16GB
|
||||
assert sel.level.bits_per_channel <= 4.0
|
||||
|
||||
def test_preferred_level(self):
|
||||
"""User can force a specific level."""
|
||||
with patch("evolution.quant_selector.detect_hardware") as mock_hw:
|
||||
mock_hw.return_value = HardwareInfo(
|
||||
total_memory_gb=64,
|
||||
available_memory_gb=48,
|
||||
cpu_cores=16,
|
||||
detection_method="mock",
|
||||
)
|
||||
sel = select_quant_level(
|
||||
model_size_gb=14.0, context_length=32768,
|
||||
preferred_level="turbo2"
|
||||
)
|
||||
assert sel.level.name == "turbo2"
|
||||
|
||||
def test_env_vars_populated(self):
|
||||
with patch("evolution.quant_selector.detect_hardware") as mock_hw:
|
||||
mock_hw.return_value = HardwareInfo(
|
||||
total_memory_gb=64,
|
||||
available_memory_gb=48,
|
||||
cpu_cores=16,
|
||||
detection_method="mock",
|
||||
)
|
||||
sel = select_quant_level(model_size_gb=14.0, context_length=32768)
|
||||
assert "TURBO_LAYER_ADAPTIVE" in sel.env_vars
|
||||
assert "-ctk" in sel.server_flags
|
||||
assert "-ctv" in sel.server_flags
|
||||
|
||||
def test_warnings_on_low_headroom(self):
|
||||
with patch("evolution.quant_selector.detect_hardware") as mock_hw:
|
||||
mock_hw.return_value = HardwareInfo(
|
||||
total_memory_gb=18,
|
||||
available_memory_gb=14,
|
||||
gpu_memory_gb=18,
|
||||
gpu_name="Test GPU",
|
||||
cpu_cores=8,
|
||||
detection_method="mock",
|
||||
)
|
||||
sel = select_quant_level(model_size_gb=16.0, context_length=65536)
|
||||
assert len(sel.warnings) > 0
|
||||
|
||||
def test_falls_back_to_turbo2_when_nothing_fits(self):
|
||||
with patch("evolution.quant_selector.detect_hardware") as mock_hw:
|
||||
mock_hw.return_value = HardwareInfo(
|
||||
total_memory_gb=8,
|
||||
available_memory_gb=6,
|
||||
gpu_memory_gb=8,
|
||||
gpu_name="Tiny GPU",
|
||||
cpu_cores=4,
|
||||
detection_method="mock",
|
||||
)
|
||||
sel = select_quant_level(model_size_gb=16.0, context_length=131072)
|
||||
assert sel.level.name == "turbo2"
|
||||
|
||||
def test_reasoning_contains_key_info(self):
|
||||
with patch("evolution.quant_selector.detect_hardware") as mock_hw:
|
||||
mock_hw.return_value = HardwareInfo(
|
||||
total_memory_gb=32,
|
||||
available_memory_gb=24,
|
||||
is_apple_silicon=True,
|
||||
chip_name="M4 Max",
|
||||
cpu_cores=16,
|
||||
detection_method="mock",
|
||||
)
|
||||
sel = select_quant_level(model_size_gb=14.0, context_length=32768)
|
||||
assert "turbo4" in sel.reasoning
|
||||
assert "M4 Max" in sel.reasoning or "32GB" in sel.reasoning
|
||||
338
tests/test_tool_call_integration.py
Normal file
338
tests/test_tool_call_integration.py
Normal file
@@ -0,0 +1,338 @@
|
||||
"""
|
||||
Integration test: turboquant compressed model passes hermes tool calls (issue #82).
|
||||
|
||||
Validates that a TurboQuant-compressed model can:
|
||||
1. Parse hermes tool schemas correctly
|
||||
2. Format tool calls in OpenAI-compatible format
|
||||
3. Pass through the hermes agent conversation loop
|
||||
|
||||
Tests are structured as contract tests -- they validate the schema/format
|
||||
compatibility without requiring a running model server. The live inference
|
||||
test is skipped by default (requires llama-server with TurboQuant model).
|
||||
|
||||
Usage:
|
||||
pytest tests/test_tool_call_integration.py -v
|
||||
pytest tests/test_tool_call_integration.py -v -k live # run live test if server available
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
ROOT = pathlib.Path(__file__).resolve().parents[1]
|
||||
PROFILE_PATH = ROOT / "profiles" / "hermes-profile-gemma4-turboquant.yaml"
|
||||
BENCHMARKS_DIR = ROOT / "benchmarks"
|
||||
|
||||
|
||||
class TestHermesProfileSchema(unittest.TestCase):
    """Validate the hermes profile YAML has required fields for tool calling."""

    @classmethod
    def setUpClass(cls):
        import yaml
        cls.profile = yaml.safe_load(PROFILE_PATH.read_text())

    def test_profile_has_providers(self):
        assert "providers" in self.profile, "Profile must define providers"
        assert "primary" in self.profile["providers"], "Must have primary provider"

    def test_primary_provider_has_endpoint(self):
        primary = self.profile["providers"]["primary"]
        assert "endpoint" in primary, "Primary provider must have endpoint"
        assert primary["endpoint"].startswith("http"), "Endpoint must be HTTP(S) URL"

    def test_primary_provider_has_api_path(self):
        primary = self.profile["providers"]["primary"]
        assert "api_path" in primary, "Primary provider must have api_path"
        assert "/chat/completions" in primary["api_path"], (
            "api_path should be OpenAI-compatible /chat/completions"
        )

    def test_turboquant_settings_present(self):
        primary = self.profile["providers"]["primary"]
        assert "turboquant" in primary, "Must have turboquant config section"
        tq = primary["turboquant"]
        assert tq.get("enabled") is True, "TurboQuant must be enabled"
        assert tq.get("kv_type") in ("turbo2", "turbo3", "turbo4"), (
            "kv_type must be turbo2, turbo3, or turbo4"
        )

    def test_context_window_configured(self):
        primary = self.profile["providers"]["primary"]
        assert "context" in primary, "Must have context config"
        ctx = primary["context"]
        assert ctx.get("max_tokens", 0) >= 8192, (
            "max_tokens should be >= 8192 for TurboQuant value proposition"
        )

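# For orientation only: the assertions above imply that yaml.safe_load() of the
# profile yields a mapping shaped roughly like the sketch below. The literal
# values are placeholders, not the actual contents of
# profiles/hermes-profile-gemma4-turboquant.yaml.
_EXAMPLE_PROFILE_SHAPE = {
    "providers": {
        "primary": {
            "endpoint": "http://localhost:8080",        # placeholder endpoint
            "api_path": "/v1/chat/completions",         # OpenAI-compatible path
            "turboquant": {"enabled": True, "kv_type": "turbo4"},
            "context": {"max_tokens": 32768},
        }
    }
}
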
class TestToolSchemaCompatibility(unittest.TestCase):
    """Verify hermes tool schemas serialize to valid JSON for OpenAI tool_calls."""

    SAMPLE_TOOL_SCHEMAS = [
        {
            "type": "function",
            "function": {
                "name": "read_file",
                "description": "Read a text file with line numbers.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string", "description": "File path"},
                        "offset": {"type": "integer", "default": 1},
                        "limit": {"type": "integer", "default": 500},
                    },
                    "required": ["path"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "execute_code",
                "description": "Run a Python script.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "code": {"type": "string", "description": "Python code"},
                    },
                    "required": ["code"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "web_search",
                "description": "Search the web.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {"type": "string"},
                        "max_results": {"type": "integer", "default": 5},
                    },
                    "required": ["query"],
                },
            },
        },
    ]

    def test_tool_schemas_serialize_to_json(self):
        """Tool schemas must serialize without errors."""
        serialized = json.dumps(self.SAMPLE_TOOL_SCHEMAS)
        assert len(serialized) > 0
        parsed = json.loads(serialized)
        assert len(parsed) == len(self.SAMPLE_TOOL_SCHEMAS)

    def test_tool_schemas_have_required_openai_fields(self):
        """Each tool schema must have the fields OpenAI expects."""
        for tool in self.SAMPLE_TOOL_SCHEMAS:
            assert tool["type"] == "function", "Tool type must be 'function'"
            fn = tool["function"]
            assert "name" in fn, "Function must have name"
            assert "description" in fn, "Function must have description"
            assert "parameters" in fn, "Function must have parameters"
            params = fn["parameters"]
            assert params["type"] == "object", "Parameters type must be 'object'"
            assert "properties" in params, "Parameters must have properties"

    def test_tool_call_response_format(self):
        """Verify tool_call response matches OpenAI format."""
        tool_call = {
            "id": "call_abc123",
            "type": "function",
            "function": {
                "name": "read_file",
                "arguments": json.dumps({"path": "/tmp/test.txt"}),
            },
        }
        args = json.loads(tool_call["function"]["arguments"])
        assert args["path"] == "/tmp/test.txt"
        assert tool_call["function"]["name"] in [
            t["function"]["name"] for t in self.SAMPLE_TOOL_SCHEMAS
        ]

    def test_tool_names_are_valid_identifiers(self):
        """Tool names must be valid Python identifiers for hermes dispatch."""
        for tool in self.SAMPLE_TOOL_SCHEMAS:
            name = tool["function"]["name"]
            assert re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name), (
                f"Tool name '{name}' is not a valid identifier"
            )

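# Sketch of the tool-call round trip the contract tests above target, in
# OpenAI-style chat messages. This list is illustrative data only -- the literal
# ids, paths, and tool output are placeholders and nothing executes it.
_EXAMPLE_TOOL_ROUNDTRIP = [
    {"role": "user", "content": "Read the file at /tmp/test.txt"},
    {
        # The model answers with a tool_calls entry instead of plain content.
        "role": "assistant",
        "content": None,
        "tool_calls": [
            {
                "id": "call_abc123",
                "type": "function",
                "function": {
                    "name": "read_file",
                    "arguments": json.dumps({"path": "/tmp/test.txt"}),
                },
            }
        ],
    },
    {
        # The agent loop runs read_file and feeds the result back with role
        # "tool", referencing the tool_call id, before requesting the final answer.
        "role": "tool",
        "tool_call_id": "call_abc123",
        "content": "hello from /tmp/test.txt",
    },
]
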
class TestTurboquantServerConfig(unittest.TestCase):
    """Validate server startup configuration matches hermes profile."""

    def test_server_command_has_turboquant_flags(self):
        """The server command in the profile must include -ctk/-ctv flags."""
        profile_text = PROFILE_PATH.read_text()
        assert "-ctk" in profile_text, "Profile server command must include -ctk flag"
        assert "-ctv" in profile_text, "Profile server command must include -ctv flag"

    def test_server_command_has_context_flag(self):
        """Server command must set context size."""
        profile_text = PROFILE_PATH.read_text()
        assert re.search(r"-c\s+\d+", profile_text), (
            "Server command must include -c <context_size> flag"
        )

    def test_layer_adaptive_env_var(self):
        """Profile must set TURBO_LAYER_ADAPTIVE env var."""
        profile_text = PROFILE_PATH.read_text()
        assert "TURBO_LAYER_ADAPTIVE" in profile_text, (
            "Profile must configure TURBO_LAYER_ADAPTIVE"
        )

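# The flags checked above correspond to a llama-server launch along these lines.
# The model path, context size, and cache types below are placeholder values,
# not the ones pinned in the profile; the turbo2/turbo3/turbo4 cache types
# assume the TurboQuant build of llama-server.
#
#     TURBO_LAYER_ADAPTIVE=1 llama-server \
#         -m models/gemma-4-turboquant.gguf \
#         -c 32768 \
#         -ctk turbo4 -ctv turbo4
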
class TestBenchmarkData(unittest.TestCase):
    """Validate benchmark test prompts include tool-call test cases."""

    @classmethod
    def setUpClass(cls):
        prompts_path = BENCHMARKS_DIR / "test_prompts.json"
        cls.prompts = json.loads(prompts_path.read_text())

    def test_has_tool_call_test_prompt(self):
        """Benchmark prompts must include a tool-call format test."""
        categories = [p.get("category") for p in self.prompts]
        assert "tool_call_format" in categories, (
            "Benchmark must include a tool_call_format test case"
        )

    def test_tool_call_prompt_expects_json(self):
        """Tool call test prompt must expect JSON in the response."""
        tool_prompt = next(
            p for p in self.prompts if p.get("category") == "tool_call_format"
        )
        pattern = tool_prompt.get("expected_pattern", "")
        assert "json" in pattern.lower() or "\\{" in pattern, (
            "Tool call prompt must expect JSON-formatted response"
        )

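# The assertions above only require that each entry in benchmarks/test_prompts.json
# carry "category" and "expected_pattern" fields. One plausible tool_call_format
# entry is sketched below; the "prompt" key and the literal strings are assumptions,
# not taken from the actual benchmark data.
_EXAMPLE_TOOL_CALL_PROMPT_ENTRY = {
    "category": "tool_call_format",
    "prompt": "Call the read_file tool on /tmp/test.txt and answer with the tool call as JSON.",
    "expected_pattern": r"\{\s*\"name\"\s*:\s*\"read_file\"",
}
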
@pytest.mark.skipif(
    not os.environ.get("TURBOQUANT_SERVER_URL"),
    reason="No TurboQuant server available (set TURBOQUANT_SERVER_URL to run)",
)
class TestLiveToolCallIntegration:
    """Live integration test -- requires running llama-server with TurboQuant."""

    def test_server_health(self):
        """Server must respond to /v1/models endpoint."""
        import requests
        url = os.environ["TURBOQUANT_SERVER_URL"]
        resp = requests.get(f"{url}/v1/models", timeout=10)
        assert resp.status_code == 200
        data = resp.json()
        assert "data" in data
        assert len(data["data"]) > 0

    def test_tool_call_completion(self):
        """Model must return a valid tool_call for a read_file prompt."""
        import requests
        url = os.environ["TURBOQUANT_SERVER_URL"]
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "read_file",
                    "description": "Read a file",
                    "parameters": {
                        "type": "object",
                        "properties": {"path": {"type": "string"}},
                        "required": ["path"],
                    },
                },
            }
        ]
        resp = requests.post(
            f"{url}/v1/chat/completions",
            json={
                "model": "gemma-4",
                "messages": [
                    {"role": "user", "content": "Read the file at /tmp/test.txt"}
                ],
                "tools": tools,
                "tool_choice": "auto",
            },
            timeout=120,
        )
        assert resp.status_code == 200
        data = resp.json()
        choice = data["choices"][0]
        msg = choice["message"]
        if "tool_calls" in msg and msg["tool_calls"]:
            tc = msg["tool_calls"][0]
            assert tc["type"] == "function"
            assert tc["function"]["name"] == "read_file"
            args = json.loads(tc["function"]["arguments"])
            assert "path" in args
        else:
            assert len(msg.get("content", "")) > 0

    def test_tool_call_with_multiple_tools(self):
        """Model must handle multiple available tools."""
        import requests
        url = os.environ["TURBOQUANT_SERVER_URL"]
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "read_file",
                    "description": "Read a file",
                    "parameters": {
                        "type": "object",
                        "properties": {"path": {"type": "string"}},
                        "required": ["path"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "web_search",
                    "description": "Search the web",
                    "parameters": {
                        "type": "object",
                        "properties": {"query": {"type": "string"}},
                        "required": ["query"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "execute_code",
                    "description": "Run Python code",
                    "parameters": {
                        "type": "object",
                        "properties": {"code": {"type": "string"}},
                        "required": ["code"],
                    },
                },
            },
        ]
        resp = requests.post(
            f"{url}/v1/chat/completions",
            json={
                "model": "gemma-4",
                "messages": [
                    {"role": "user", "content": "Search the web for 'bitcoin price'"}
                ],
                "tools": tools,
                "tool_choice": "auto",
            },
            timeout=120,
        )
        assert resp.status_code == 200
        data = resp.json()
        assert "choices" in data
        assert len(data["choices"]) > 0


if __name__ == "__main__":
    unittest.main()