Files
turboquant/llama-turbo.cpp
Alexander Whitestone d2ef914edd
All checks were successful
Smoke Test / smoke (pull_request) Successful in 24s
feat: Comprehensive review and improvements for TurboQuant (#17)
This commit addresses issue #17 by providing a comprehensive review
of the TurboQuant initiative and implementing key improvements.

## Changes

### 1. Initiative Review (docs/INITIATIVE_REVIEW.md)
- Comprehensive assessment of current state
- Code quality findings and recommendations
- Contributor feedback for @manus, @Timmy, @Rockachopa
- Implementation plan with clear milestones

### 2. Code Improvements

#### llama-turbo.cpp
- Added input validation with assertions
- Optimized Lloyd-Max search with binary search (O(log n) vs O(n))
- Added stack allocation for d=128 (avoids heap allocation in hot path)
- Added error handling for edge cases
- Added decision boundaries for efficient quantization

#### ggml-metal-turbo.metal
- Added bounds checking to all kernels
- Added NaN/Inf handling for numerical stability
- Completed fused attention kernel (was stub)
- Added fused attention with softmax kernel
- Added Metal encoding kernel for completeness
- Added binary search for quantization

### 3. Testing (tests/test_turbo.cpp)
- Unit tests for encode/decode round-trip
- Tests for known values (zeros, ones)
- Tests for edge cases (large/small values)
- Error handling tests

### 4. Build System (CMakeLists.txt)
- Added CMake configuration for building library
- Added test executable
- Added install targets

### 5. Documentation (README.md)
- Added build instructions
- Added API documentation
- Added contributing guidelines
- Added code style guide

## Key Improvements

1. **Performance**: Binary search instead of linear search for Lloyd-Max quantization
2. **Memory**: Stack allocation for common case (d=128)
3. **Reliability**: Input validation and error handling
4. **Metal Integration**: Complete fused attention implementation
5. **Testing**: Unit tests for correctness verification
6. **Documentation**: Contributor guidelines and API docs

## Next Steps

1. Run benchmarks to verify performance improvements
2. Test with actual models (qwen3.5:27b)
3. Integrate with llama.cpp fork
4. Deploy to production

Closes #17
2026-04-14 22:07:21 -04:00

136 lines
4.2 KiB
C++

#include "llama-turbo.h"
#include <cmath>
#include <vector>
#include <algorithm>
#include <iostream>
#include <cassert>
#include <cstring>
// Lloyd-Max Centroids for N(0, 1/d) where d=128
// These are precomputed for 4-bit (16 levels). Indices are monotonically
// increasing, so a component quantized to index i decodes to centroids[i]
// (scaled by the stored vector norm at decode time).
static const float turbo4_centroids[16] = {
-0.2154f, -0.1523f, -0.1121f, -0.0812f,
-0.0554f, -0.0321f, -0.0105f, 0.0105f,
0.0321f, 0.0554f, 0.0812f, 0.1121f,
0.1523f, 0.2154f, 0.2800f, 0.3500f // Approximate tail values
};
// Decision boundaries for binary search (precomputed)
// boundary[i] = (centroid[i] + centroid[i+1]) / 2
// i.e. the midpoint thresholds of a nearest-centroid quantizer: a value v
// maps to index i iff boundary[i-1] <= v < boundary[i] (ends open-ended).
// All 15 entries match the midpoints of the table above exactly.
static const float turbo4_boundaries[15] = {
-0.18385f, -0.1322f, -0.09665f, -0.0683f,
-0.04375f, -0.0213f, 0.0f, 0.0213f,
0.04375f, 0.0683f, 0.09665f, 0.1322f,
0.18385f, 0.2477f, 0.315f
};
// In-place Fast Walsh-Hadamard Transform with orthonormal scaling.
//
// Runs log2(n) butterfly passes over spans of doubling width, then
// multiplies by 1/sqrt(n). With that normalization H is its own inverse,
// which is why encode and decode can both call this same routine.
//
// a: buffer of n floats, transformed in place.
// n: length; must be a positive power of two (checked by assert).
void fwht(float* a, int n) {
    assert(n > 0 && (n & (n - 1)) == 0 && "n must be power of 2");
    for (int span = 1; span < n; span <<= 1) {
        const int stride = span << 1;
        for (int base = 0; base < n; base += stride) {
            for (int k = base; k < base + span; ++k) {
                const float u = a[k];
                const float v = a[k + span];
                a[k]        = u + v;
                a[k + span] = u - v;
            }
        }
    }
    // Orthonormal normalization: divide every element by sqrt(n).
    const float inv_sqrt_n = 1.0f / sqrtf((float)n);
    for (int i = 0; i < n; ++i) {
        a[i] *= inv_sqrt_n;
    }
}
// Binary search for Lloyd-Max quantization.
//
// Returns the nearest-centroid index in [0, 15] for a normalized component.
// The loop is an upper_bound over turbo4_boundaries: the result equals the
// number of boundaries <= val, so a value landing exactly on a boundary
// rounds toward the larger centroid.
//
// Bug fix: the search range must be half-open [0, 16) over the 16 codewords,
// i.e. right = 15 (one past the last boundary index). The previous right = 14
// meant turbo4_boundaries[14] (0.315) was never tested, so index 15
// (centroid 0.3500) was unreachable and every value >= 0.315 was
// mis-quantized to centroid 14.
static inline int quantize_turbo4(float val) {
    int left = 0, right = 15; // right = number of boundaries = levels - 1
    while (left < right) {
        int mid = (left + right) / 2; // mid in [0, 14]: always a valid boundary index
        if (val < turbo4_boundaries[mid]) {
            right = mid;
        } else {
            left = mid + 1;
        }
    }
    return left; // in [0, 15]
}
// PolarQuant Encode (CPU Reference)
//
// Rotates `src` with the orthonormal FWHT, stores the rotated vector's L2
// norm (radius) in *norm, then quantizes each normalized component to a
// 4-bit Lloyd-Max index, packing two indices per output byte: even i in
// the low nibble, odd i in the high nibble. dst must hold (d + 1) / 2 bytes.
//
// src:  input vector of d floats (not modified).
// dst:  packed 4-bit indices, (d + 1) / 2 bytes.
// norm: receives the L2 norm of the rotated vector.
// d:    dimension; must be a positive power of two.
//
// Improvement: the previous version duplicated the entire norm/quantize/pack
// sequence in the heap (d > 128) and stack (d <= 128) paths — a maintenance
// hazard where a fix to one copy could miss the other. Both paths now share
// one code path; only the buffer selection differs.
void polar_quant_encode_turbo4(const float* src, uint8_t* dst, float* norm, int d) {
    assert(src != nullptr && "src cannot be null");
    assert(dst != nullptr && "dst cannot be null");
    assert(norm != nullptr && "norm cannot be null");
    assert(d > 0 && (d & (d - 1)) == 0 && "d must be power of 2");

    // Stack buffer covers the common case (d = 128 is 512 bytes) and avoids
    // a heap allocation in the hot path; larger d falls back to the heap.
    float stack_buf[128];
    std::vector<float> heap_buf;
    float* rotated;
    if (d <= 128) {
        rotated = stack_buf;
    } else {
        heap_buf.resize(d);
        rotated = heap_buf.data();
    }
    memcpy(rotated, src, d * sizeof(float));
    fwht(rotated, d);

    // Calculate L2 Norm (Radius)
    float sum_sq = 0;
    for (int i = 0; i < d; i++) sum_sq += rotated[i] * rotated[i];
    *norm = sqrtf(sum_sq);

    // Quantize components; the epsilon guards against a zero-norm vector.
    float inv_norm = 1.0f / (*norm + 1e-9f);
    for (int i = 0; i < d; i++) {
        float val = rotated[i] * inv_norm;
        int best_idx = quantize_turbo4(val);
        // Pack 4-bit indices: even index writes the byte (low nibble),
        // odd index ORs into the high nibble of the same byte.
        if (i % 2 == 0) {
            dst[i / 2] = (uint8_t)best_idx;
        } else {
            dst[i / 2] |= (uint8_t)(best_idx << 4);
        }
    }
}
// PolarQuant Decode (CPU Reference)
//
// Unpacks two 4-bit Lloyd-Max indices per input byte (low nibble first),
// scales each centroid by `norm`, then applies the FWHT. The normalized
// Hadamard transform is an involution, so the same fwht() call undoes the
// rotation performed by the encoder.
void polar_quant_decode_turbo4(const uint8_t* src, float* dst, float norm, int d) {
    assert(src != nullptr && "src cannot be null");
    assert(dst != nullptr && "dst cannot be null");
    assert(d > 0 && (d & (d - 1)) == 0 && "d must be power of 2");
    for (int i = 0; i < d; i++) {
        const uint8_t packed = src[i / 2];
        const int idx = (i % 2 == 0) ? (packed & 0x0F) : (packed >> 4);
        assert(idx >= 0 && idx < 16 && "Invalid index");
        dst[i] = norm * turbo4_centroids[idx];
    }
    // Inverse WHT is same as Forward WHT for orthogonal matrices
    fwht(dst, d);
}