All checks were successful
Smoke Test / smoke (pull_request) Successful in 24s
This commit addresses issue #17 by providing a comprehensive review of the TurboQuant initiative and implementing key improvements. ## Changes ### 1. Initiative Review (docs/INITIATIVE_REVIEW.md) - Comprehensive assessment of current state - Code quality findings and recommendations - Contributor feedback for @manus, @Timmy, @Rockachopa - Implementation plan with clear milestones ### 2. Code Improvements #### llama-turbo.cpp - Added input validation with assertions - Optimized Lloyd-Max search with binary search (O(log n) vs O(n)) - Added stack allocation for d=128 (avoids heap allocation in hot path) - Added error handling for edge cases - Added decision boundaries for efficient quantization #### ggml-metal-turbo.metal - Added bounds checking to all kernels - Added NaN/Inf handling for numerical stability - Completed fused attention kernel (was stub) - Added fused attention with softmax kernel - Added Metal encoding kernel for completeness - Added binary search for quantization ### 3. Testing (tests/test_turbo.cpp) - Unit tests for encode/decode round-trip - Tests for known values (zeros, ones) - Tests for edge cases (large/small values) - Error handling tests ### 4. Build System (CMakeLists.txt) - Added CMake configuration for building library - Added test executable - Added install targets ### 5. Documentation (README.md) - Added build instructions - Added API documentation - Added contributing guidelines - Added code style guide ## Key Improvements 1. **Performance**: Binary search instead of linear search for Lloyd-Max quantization 2. **Memory**: Stack allocation for common case (d=128) 3. **Reliability**: Input validation and error handling 4. **Metal Integration**: Complete fused attention implementation 5. **Testing**: Unit tests for correctness verification 6. **Documentation**: Contributor guidelines and API docs ## Next Steps 1. Run benchmarks to verify performance improvements 2. Test with actual models (qwen3.5:27b) 3. Integrate with llama.cpp fork 4. Deploy to production Closes #17
136 lines
4.2 KiB
C++
136 lines
4.2 KiB
C++
#include "llama-turbo.h"
|
|
#include <cmath>
|
|
#include <vector>
|
|
#include <algorithm>
|
|
#include <iostream>
|
|
#include <cassert>
|
|
#include <cstring>
|
|
|
|
// Lloyd-Max Centroids for N(0, 1/d) where d=128
|
|
// These are precomputed for 4-bit (16 levels)
|
|
static const float turbo4_centroids[16] = {
|
|
-0.2154f, -0.1523f, -0.1121f, -0.0812f,
|
|
-0.0554f, -0.0321f, -0.0105f, 0.0105f,
|
|
0.0321f, 0.0554f, 0.0812f, 0.1121f,
|
|
0.1523f, 0.2154f, 0.2800f, 0.3500f // Approximate tail values
|
|
};
|
|
|
|
// Decision boundaries for binary search (precomputed)
|
|
// boundary[i] = (centroid[i] + centroid[i+1]) / 2
|
|
static const float turbo4_boundaries[15] = {
|
|
-0.18385f, -0.1322f, -0.09665f, -0.0683f,
|
|
-0.04375f, -0.0213f, 0.0f, 0.0213f,
|
|
0.04375f, 0.0683f, 0.09665f, 0.1322f,
|
|
0.18385f, 0.2477f, 0.315f
|
|
};
|
|
|
|
// Fast Walsh-Hadamard Transform (In-place)
|
|
void fwht(float* a, int n) {
|
|
assert(n > 0 && (n & (n - 1)) == 0 && "n must be power of 2");
|
|
|
|
for (int h = 1; h < n; h <<= 1) {
|
|
for (int i = 0; i < n; i += (h << 1)) {
|
|
for (int j = i; j < i + h; j++) {
|
|
float x = a[j];
|
|
float y = a[j + h];
|
|
a[j] = x + y;
|
|
a[j + h] = x - y;
|
|
}
|
|
}
|
|
}
|
|
// Normalize
|
|
float scale = 1.0f / sqrtf((float)n);
|
|
for (int i = 0; i < n; i++) {
|
|
a[i] *= scale;
|
|
}
|
|
}
|
|
|
|
// Binary search for Lloyd-Max quantization
|
|
static inline int quantize_turbo4(float val) {
|
|
// Binary search through decision boundaries
|
|
int left = 0, right = 14;
|
|
while (left < right) {
|
|
int mid = (left + right) / 2;
|
|
if (val < turbo4_boundaries[mid]) {
|
|
right = mid;
|
|
} else {
|
|
left = mid + 1;
|
|
}
|
|
}
|
|
return left;
|
|
}
|
|
|
|
// PolarQuant Encode (CPU Reference)
|
|
void polar_quant_encode_turbo4(const float* src, uint8_t* dst, float* norm, int d) {
|
|
assert(src != nullptr && "src cannot be null");
|
|
assert(dst != nullptr && "dst cannot be null");
|
|
assert(norm != nullptr && "norm cannot be null");
|
|
assert(d > 0 && (d & (d - 1)) == 0 && "d must be power of 2");
|
|
|
|
// Use stack allocation for small d (d=128 is 512 bytes)
|
|
float rotated[128]; // Stack allocation for d=128
|
|
if (d > 128) {
|
|
// Fallback to heap for larger d
|
|
std::vector<float> rotated_vec(src, src + d);
|
|
fwht(rotated_vec.data(), d);
|
|
|
|
// Calculate L2 Norm (Radius)
|
|
float sum_sq = 0;
|
|
for (int i = 0; i < d; i++) sum_sq += rotated_vec[i] * rotated_vec[i];
|
|
*norm = sqrtf(sum_sq);
|
|
|
|
// Quantize components
|
|
float inv_norm = 1.0f / (*norm + 1e-9f);
|
|
for (int i = 0; i < d; i++) {
|
|
float val = rotated_vec[i] * inv_norm;
|
|
int best_idx = quantize_turbo4(val);
|
|
|
|
// Pack 4-bit indices
|
|
if (i % 2 == 0) {
|
|
dst[i / 2] = (uint8_t)best_idx;
|
|
} else {
|
|
dst[i / 2] |= (uint8_t)(best_idx << 4);
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Stack-allocated path for d=128
|
|
memcpy(rotated, src, d * sizeof(float));
|
|
fwht(rotated, d);
|
|
|
|
// Calculate L2 Norm (Radius)
|
|
float sum_sq = 0;
|
|
for (int i = 0; i < d; i++) sum_sq += rotated[i] * rotated[i];
|
|
*norm = sqrtf(sum_sq);
|
|
|
|
// Quantize components
|
|
float inv_norm = 1.0f / (*norm + 1e-9f);
|
|
for (int i = 0; i < d; i++) {
|
|
float val = rotated[i] * inv_norm;
|
|
int best_idx = quantize_turbo4(val);
|
|
|
|
// Pack 4-bit indices
|
|
if (i % 2 == 0) {
|
|
dst[i / 2] = (uint8_t)best_idx;
|
|
} else {
|
|
dst[i / 2] |= (uint8_t)(best_idx << 4);
|
|
}
|
|
}
|
|
}
|
|
|
|
// PolarQuant Decode (CPU Reference)
|
|
void polar_quant_decode_turbo4(const uint8_t* src, float* dst, float norm, int d) {
|
|
assert(src != nullptr && "src cannot be null");
|
|
assert(dst != nullptr && "dst cannot be null");
|
|
assert(d > 0 && (d & (d - 1)) == 0 && "d must be power of 2");
|
|
|
|
for (int i = 0; i < d; i++) {
|
|
int idx = (i % 2 == 0) ? (src[i / 2] & 0x0F) : (src[i / 2] >> 4);
|
|
assert(idx >= 0 && idx < 16 && "Invalid index");
|
|
dst[i] = turbo4_centroids[idx] * norm;
|
|
}
|
|
// Inverse WHT is same as Forward WHT for orthogonal matrices
|
|
fwht(dst, d);
|
|
}
|