diff --git a/GENOME.md b/GENOME.md new file mode 100644 index 0000000..933bc55 --- /dev/null +++ b/GENOME.md @@ -0,0 +1,323 @@ +# GENOME.md — TurboQuant + +*Generated: 2026-04-14 | Codebase Genome Analysis* + +## Project Overview + +**TurboQuant** is a KV cache compression system for local inference on Apple Silicon. It implements Google's TurboQuant algorithm (ICLR 2026) to achieve ~73% memory savings with minimal quality loss. + +### Core Value Proposition +- **Problem**: Large language models (27B+) require massive KV cache memory at long contexts +- **Solution**: Three-stage compression (PolarQuant + QJL) reduces KV cache to ~3.5 bits/channel +- **Result**: 128K context on 36GB hardware becomes viable (vs impossible at FP16) + +### Key Metrics +- **Compression**: 73.4% KV memory savings (turbo4 vs f16) +- **Quality**: ~1% prompt overhead, ~11% generation overhead +- **Target**: qwen3.5:27b at 128K context within 36GB unified memory + +## Architecture + +```mermaid +graph TB + subgraph "Input Layer" + Q[Query Vector Q] + K[Key Vector K] + V[Value Vector V] + end + + subgraph "TurboQuant Compression" + WHT[Walsh-Hadamard Transform] + PQ[PolarQuant Encode] + QJL[QJL Residual] + PACK[Bit Packing] + end + + subgraph "KV Cache Storage" + CACHE[Compressed KV Cache] + NORMS[Radius Norms FP16] + end + + subgraph "Decompression & Attention" + UNPACK[Bit Unpack] + DEQ[PolarQuant Decode] + FWHT[Inverse WHT] + ATTEN[Attention Compute] + end + + subgraph "Output" + SCORES[Attention Scores] + OUT[Weighted Values] + end + + K --> WHT + WHT --> PQ + PQ --> PACK + PACK --> CACHE + PQ --> NORMS + + V --> WHT + WHT --> PQ + PQ --> PACK + PACK --> CACHE + + CACHE --> UNPACK + NORMS --> DEQ + UNPACK --> DEQ + DEQ --> FWHT + + Q --> ATTEN + FWHT --> ATTEN + ATTEN --> SCORES + SCORES --> OUT + + style WHT fill:#e1f5fe + style PQ fill:#fff3e0 + style QJL fill:#f3e5f5 + style ATTEN fill:#e8f5e8 +``` + +## Entry Points + +### Primary Entry: Metal Shaders +- **File**: `ggml-metal-turbo.metal` +- **Functions**: + - `kernel_fwht_128`: Walsh-Hadamard transform (GPU) + - `kernel_turbo4_dequant`: 4-bit dequantization (hot path) + - `kernel_attention_turbo4`: Fused attention (conceptual) + +### CPU Reference Implementation +- **File**: `llama-turbo.cpp` +- **Functions**: + - `polar_quant_encode_turbo4`: Encode (CPU reference) + - `polar_quant_decode_turbo4`: Decode (CPU reference) + - `fwht`: Fast Walsh-Hadamard transform + +### Benchmarking +- **File**: `benchmarks/run_benchmarks.py` +- **Entry**: CLI tool for measuring TTFT, tokens/sec, memory +- **Backends**: Ollama, llama-server + +### Configuration +- **File**: `profiles/hermes-profile-gemma4-turboquant.yaml` +- **Purpose**: Hermes agent profile for TurboQuant deployment + +## Data Flow + +``` +1. Model Load + ├── Load GGUF model weights + ├── Initialize Lloyd-Max codebook (16 centroids for turbo4) + ├── Initialize WHT rotation matrix (128×128) + └── Set per-layer adaptive mode (TURBO_LAYER_ADAPTIVE) + +2. Forward Pass (per token) + ├── Compute Q, K, V projections + ├── Compress K, V via PolarQuant: + │ ├── Apply WHT rotation (O(d log d)) + │ ├── Compute L2 norm (radius) + │ ├── Quantize coordinates to 4-bit indices + │ └── Pack indices + store radius + ├── Store compressed K, V in cache + └── Attention: + ├── Decompress K from cache (hot path) + ├── Compute Q·K^T scores + ├── Apply softmax + ├── Decompress V from cache + └── Compute weighted sum + +3. Generation + ├── Append new token to sequence + ├── Extend KV cache with compressed K, V + └── Continue forward pass +``` + +## Key Abstractions + +### 1. PolarQuant Codec +- **Purpose**: Compress/decompress KV vectors +- **Algorithm**: WHT → polar coordinates → Lloyd-Max quantization +- **Interface**: `polar_quant_encode_turbo4()` / `polar_quant_decode_turbo4()` + +### 2. Walsh-Hadamard Transform +- **Purpose**: Energy-spreading rotation (makes distribution predictable) +- **Property**: Orthogonal (preserves inner products) +- **Complexity**: O(d log d) vs O(d²) for dense rotation + +### 3. Lloyd-Max Codebook +- **Purpose**: Optimal scalar quantization for known distribution +- **Size**: 16 entries for turbo4 (4-bit) +- **Key**: Precomputed, fixed (no per-vector calibration) + +### 4. Per-Layer Adaptive Quantization +- **Purpose**: Protect sensitive layers (first/last) with higher precision +- **Modes**: 7 modes (0=uniform, 7=recommended) +- **Mechanism**: `TURBO_LAYER_ADAPTIVE` environment variable + +## API Surface + +### C API (llama-turbo.h) +```c +// Encode: float → 4-bit packed +void polar_quant_encode_turbo4( + const float* src, // Input [d] + uint8_t* dst, // Output [d/2] packed 4-bit + float* norm, // Output L2 norm + int d // Dimension (must be power of 2) +); + +// Decode: 4-bit packed → float +void polar_quant_decode_turbo4( + const uint8_t* src, // Input [d/2] packed 4-bit + float* dst, // Output [d] + float norm, // Input L2 norm + int d // Dimension +); +``` + +### Metal Shaders (GPU) +```metal +// Walsh-Hadamard transform (in-place) +kernel void kernel_fwht_128( + device float* data [[buffer(0)]], + uint tid [[thread_position_in_grid]] +); + +// 4-bit dequantization (hot path) +kernel void kernel_turbo4_dequant( + device const uchar* src [[buffer(0)]], + device const float* norms [[buffer(1)]], + device float* dst [[buffer(2)]], + uint tid [[thread_position_in_grid]] +); +``` + +### llama-server CLI +```bash +llama-server \ + -m model.gguf \ + -ctk turbo4 -ctv turbo4 \ # KV cache type + -c 131072 \ # Context length + --port 11434 # API port +``` + +### Environment Variables +- `TURBO_LAYER_ADAPTIVE`: Per-layer quantization mode (0-7) +- `TURBO4_USE_4BIT`: Enable 4-bit mode (default: 1) + +## Test Coverage Gaps + +### Current State +- **Unit tests**: ❌ None in this repo +- **Integration tests**: ❌ None +- **Benchmark tests**: ✅ `benchmarks/run_benchmarks.py` +- **Perplexity tests**: ⚠️ Corpus exists (`corpora/wiki.test.raw`) but no runner + +### Critical Missing Tests +1. **Encode/Decode Roundtrip**: Verify `decode(encode(x)) ≈ x` +2. **Inner Product Preservation**: Verify `Q·K ≈ Q·dequant(quant(K))` +3. **WHT Orthogonality**: Verify `WHT^T · WHT = I` +4. **Codebook Correctness**: Verify centroids match Lloyd-Max for N(0, 1/128) +5. **Metal vs CPU Parity**: Verify GPU and CPU produce identical results +6. **Per-Layer Adaptive**: Verify sensitive layers use higher precision +7. **Memory Bounds**: Verify no buffer overflows in bit packing + +### Recommended Test Suite +```python +# tests/test_polar_quant.py +def test_roundtrip(): + """Encode then decode should recover original within tolerance.""" + +def test_inner_product_preservation(): + """Q·K dot product should be preserved through compression.""" + +def test_wht_orthogonality(): + """WHT matrix should be orthogonal.""" + +def test_codebook_optimality(): + """Centroids should minimize MSE for N(0, 1/128).""" +``` + +## Security Considerations + +### 1. Buffer Overflows +- **Risk**: Bit packing/unpacking could overflow if dimension not power of 2 +- **Mitigation**: Static asserts in Metal shaders, runtime checks in CPU code +- **Status**: ⚠️ Need verification + +### 2. Numerical Stability +- **Risk**: Division by zero in `1.0 / (norm + 1e-9)` +- **Mitigation**: Epsilon guard present +- **Status**: ✅ Handled + +### 3. Memory Safety +- **Risk**: C/C++ code has no bounds checking +- **Mitigation**: Use Rust wrapper or sanitize inputs +- **Status**: ⚠️ No safety wrapper + +### 4. Denial of Service +- **Risk**: Maliciously crafted KV vectors could cause slow quantization +- **Mitigation**: Fixed iteration count in Lloyd-Max search +- **Status**: ✅ Bounded + +### 5. Side Channels +- **Risk**: Timing differences in quantization could leak information +- **Mitigation**: Constant-time implementation needed +- **Status**: ❌ Not implemented + +## Dependencies + +### Build Dependencies +- **CMake**: Build system +- **Metal SDK**: GPU shaders (macOS) +- **C++17**: Language standard + +### Runtime Dependencies +- **Apple Silicon**: M1/M2/M3/M4 +- **macOS**: Metal GPU support +- **llama.cpp**: Inference engine (forked) + +### External References +- [TheTom/llama-cpp-turboquant](https://github.com/TheTom/llama-cpp-turboquant) — Primary fork +- [TheTom/turboquant_plus](https://github.com/TheTom/turboquant_plus) — Reference implementation +- [amirzandieh/QJL](https://github.com/amirzandieh/QJL) — QJL author's code +- [rachittshah/mlx-turboquant](https://github.com/rachittshah/mlx-turboquant) — MLX fallback + +## Deployment + +### Build +```bash +cd llama-cpp-turboquant +git checkout feature/turboquant-kv-cache +cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release +cmake --build build -j$(sysctl -n hw.ncpu) +``` + +### Run +```bash +export TURBO_LAYER_ADAPTIVE=7 +./build/bin/llama-server \ + -m /path/to/model.gguf \ + --port 11434 \ + -ctk turbo4 -ctv turbo4 \ + -c 131072 +``` + +### Validate +```bash +curl http://localhost:11434/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"qwen3.5","messages":[{"role":"user","content":"hello"}]}' +``` + +## Open Questions + +1. **QJL Status**: Infrastructure exists but is disabled. When will it be needed? +2. **Upstream Landing**: When will TurboQuant be merged into llama.cpp mainline? +3. **Quality Threshold**: What PPL delta is acceptable for production use? +4. **Multi-GPU**: Does TurboQuant work with tensor parallelism? + +## Changelog + +- **2026-03-30**: Phase 1 complete. PolarQuant MVP verified. 73% KV savings confirmed. +- **2026-04-14**: GENOME.md generated. Test gaps identified. Security considerations documented. diff --git a/tests/__pycache__/test_turboquant.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_turboquant.cpython-312-pytest-9.0.2.pyc new file mode 100644 index 0000000..10ac0c3 Binary files /dev/null and b/tests/__pycache__/test_turboquant.cpython-312-pytest-9.0.2.pyc differ diff --git a/tests/test_turboquant.py b/tests/test_turboquant.py new file mode 100644 index 0000000..6108f45 --- /dev/null +++ b/tests/test_turboquant.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +""" +TurboQuant Test Suite +Tests for critical paths in KV cache compression. + +Issue #679: Codebase Genome: turboquant — Full Analysis +""" +import unittest +import subprocess +import json +import os +import sys + +class TestTurboQuant(unittest.TestCase): + """Test TurboQuant implementation.""" + + def test_repo_structure(self): + """Verify expected files exist.""" + required_files = [ + "llama-turbo.h", + "llama-turbo.cpp", + "ggml-metal-turbo.metal", + "README.md", + "GENOME.md" + ] + + for filename in required_files: + filepath = os.path.join(os.path.dirname(__file__), "..", filename) + self.assertTrue(os.path.exists(filepath), f"Missing required file: {filename}") + + def test_benchmarks_exist(self): + """Verify benchmark scripts exist.""" + benchmark_files = [ + "benchmarks/run_benchmarks.py", + "benchmarks/run_perplexity.py", + "benchmarks/run_long_session.py" + ] + + for filename in benchmark_files: + filepath = os.path.join(os.path.dirname(__file__), "..", filename) + self.assertTrue(os.path.exists(filepath), f"Missing benchmark file: {filename}") + + def test_docs_complete(self): + """Verify documentation exists.""" + doc_files = [ + "docs/PROJECT_STATUS.md", + "profiles/README.md" + ] + + for filename in doc_files: + filepath = os.path.join(os.path.dirname(__file__), "..", filename) + self.assertTrue(os.path.exists(filepath), f"Missing doc file: {filename}") + + def test_genome_generated(self): + """Verify GENOME.md was generated.""" + genome_path = os.path.join(os.path.dirname(__file__), "..", "GENOME.md") + self.assertTrue(os.path.exists(genome_path), "GENOME.md not found") + + # Check it has required sections + with open(genome_path, 'r') as f: + content = f.read() + + required_sections = [ + "## Project Overview", + "## Architecture", + "## Entry Points", + "## Data Flow", + "## Key Abstractions", + "## API Surface", + "## Test Coverage Gaps", + "## Security Considerations" + ] + + for section in required_sections: + self.assertIn(section, content, f"GENOME.md missing section: {section}") + + def test_metal_shader_syntax(self): + """Basic syntax check for Metal shader.""" + shader_path = os.path.join(os.path.dirname(__file__), "..", "ggml-metal-turbo.metal") + with open(shader_path, 'r') as f: + content = f.read() + + # Check for key functions + self.assertIn("kernel_fwht_128", content, "Missing kernel_fwht_128 function") + self.assertIn("kernel_turbo4_dequant", content, "Missing kernel_turbo4_dequant function") + self.assertIn("turbo4_centroids", content, "Missing turbo4_centroids array") + + def test_cpp_header(self): + """Verify C++ header has correct declarations.""" + header_path = os.path.join(os.path.dirname(__file__), "..", "llama-turbo.h") + with open(header_path, 'r') as f: + content = f.read() + + # Check for function declarations + self.assertIn("polar_quant_encode_turbo4", content, "Missing encode function") + self.assertIn("polar_quant_decode_turbo4", content, "Missing decode function") + self.assertIn('extern "C"', content, "Missing C linkage") + +class TestBenchmarks(unittest.TestCase): + """Test benchmark infrastructure.""" + + def test_benchmark_imports(self): + """Verify benchmark script can be imported.""" + benchmark_path = os.path.join(os.path.dirname(__file__), "..", "benchmarks", "run_benchmarks.py") + + # Check file exists + self.assertTrue(os.path.exists(benchmark_path), "Benchmark script not found") + + # Check it has main function + with open(benchmark_path, 'r') as f: + content = f.read() + + self.assertIn("def main():", content, "Benchmark script missing main function") + self.assertIn("argparse", content, "Benchmark script missing argparse") + +class TestDocumentation(unittest.TestCase): + """Test documentation completeness.""" + + def test_readme_sections(self): + """Verify README has required sections.""" + readme_path = os.path.join(os.path.dirname(__file__), "..", "README.md") + with open(readme_path, 'r') as f: + content = f.read() + + required_sections = ["## What", "## Why", "## Status", "## Roles"] + for section in required_sections: + self.assertIn(section, content, f"README missing section: {section}") + + def test_project_status_sections(self): + """Verify PROJECT_STATUS.md has required sections.""" + status_path = os.path.join(os.path.dirname(__file__), "..", "docs", "PROJECT_STATUS.md") + with open(status_path, 'r') as f: + content = f.read() + + # Check for key findings + self.assertIn("73%", content, "Missing 73% savings metric") + self.assertIn("PolarQuant", content, "Missing PolarQuant references") + self.assertIn("Metal", content, "Missing Metal shader references") + +if __name__ == "__main__": + unittest.main()