Compare commits
15 Commits
fix/74-git
...
burn/98-17
| Author | SHA1 | Date | |
|---|---|---|---|
| bf68627ea1 | |||
| dbbed6790f | |||
| 3e304db72a | |||
| 0d427b69d3 | |||
| f1a699a3fc | |||
| b9f2da9e19 | |||
| 5e7c637bbf | |||
| e46c2b6155 | |||
| 8e9afb34fa | |||
| 713db4962e | |||
| 3cd8750cbb | |||
| ef765bbd30 | |||
|
|
5f0d00f127 | ||
|
|
8affe79489 | ||
|
|
319f57780d |
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
build/
|
||||
*.pyc
|
||||
__pycache__/
|
||||
36
CMakeLists.txt
Normal file
36
CMakeLists.txt
Normal file
@@ -0,0 +1,36 @@
|
||||
# Standalone build for the TurboQuant library and its validation test.
cmake_minimum_required(VERSION 3.16)

project(turboquant LANGUAGES CXX)

# Tests are built by default; pass -DTURBOQUANT_BUILD_TESTS=OFF to skip them.
option(TURBOQUANT_BUILD_TESTS "Build standalone TurboQuant validation tests" ON)

# Static library built from the single implementation file.
add_library(turboquant STATIC llama-turbo.cpp)

# Consumers see the headers that sit next to this CMakeLists.txt.
target_include_directories(turboquant PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

# C++17 is required by the library and by anything that links against it.
target_compile_features(turboquant PUBLIC cxx_std_17)

# Warning flags apply to the library's own sources only (PRIVATE).
if(MSVC)
    target_compile_options(turboquant PRIVATE /W4)
else()
    target_compile_options(turboquant PRIVATE -Wall -Wextra -Wpedantic)
endif()

if(TURBOQUANT_BUILD_TESTS)
    include(CTest)

    # Encode/decode round-trip check, registered with CTest below.
    add_executable(turboquant_roundtrip_test tests/roundtrip_test.cpp)
    target_link_libraries(turboquant_roundtrip_test PRIVATE turboquant)
    target_compile_features(turboquant_roundtrip_test PRIVATE cxx_std_17)

    add_test(NAME turboquant_roundtrip COMMAND turboquant_roundtrip_test)
endif()
|
||||
@@ -13,7 +13,7 @@ Unlock 64K-128K context on qwen3.5:27b within 32GB unified memory.
|
||||
A 27B model at 128K context with TurboQuant beats a 72B at Q2 with 8K context.
|
||||
|
||||
## Status
|
||||
See [issues](http://143.198.27.163:3000/Timmy_Foundation/turboquant/issues) for current progress.
|
||||
See [issues](https://forge.alexanderwhitestone.com/Timmy_Foundation/turboquant/issues) for current progress.
|
||||
|
||||
## Roles
|
||||
- **Strago:** Build spec author
|
||||
@@ -29,4 +29,4 @@ See [issues](http://143.198.27.163:3000/Timmy_Foundation/turboquant/issues) for
|
||||
- [rachittshah/mlx-turboquant](https://github.com/rachittshah/mlx-turboquant) — MLX fallback
|
||||
|
||||
## Docs
|
||||
- [BUILD-SPEC.md](BUILD-SPEC.md) — Full build specification (Strago, v2.2)
|
||||
- [Project Status](docs/PROJECT_STATUS.md) — Full project status and build specification
|
||||
|
||||
47
ansible/README.md
Normal file
47
ansible/README.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# TurboQuant Ansible Deployment
|
||||
|
||||
Deploy TurboQuant-compressed Gemma 4 inference across fleet nodes.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# 1. Copy and edit inventory
|
||||
cp inventory.ini.example inventory.ini
|
||||
vim inventory.ini
|
||||
|
||||
# 2. Deploy to all nodes
|
||||
ansible-playbook -i inventory.ini deploy_turboquant.yml
|
||||
|
||||
# 3. Deploy without integration tests
|
||||
ansible-playbook -i inventory.ini deploy_turboquant.yml -e run_integration_tests=false
|
||||
|
||||
# 4. Deploy to specific node
|
||||
ansible-playbook -i inventory.ini deploy_turboquant.yml --limit timmy
|
||||
```
|
||||
|
||||
## Deployment Matrix
|
||||
|
||||
| Node | Hardware | Model | Preset |
|
||||
|------|----------|-------|--------|
|
||||
| Mac (Timmy) | M1, 16GB | gemma-4-26B-A4B | turboquant_k8v4 |
|
||||
| Allegro VPS | 2 cores, 8GB | gemma-4-E4B | GGUF q4_0 |
|
||||
|
||||
## Health Check
|
||||
|
||||
```bash
|
||||
# Check local node
|
||||
./health_check.sh localhost 8081
|
||||
|
||||
# Check remote node
|
||||
./health_check.sh 192.168.1.100 8081
|
||||
```
|
||||
|
||||
## Role Variables
|
||||
|
||||
See `roles/turboquant-deploy/defaults/main.yml` for all configurable variables.
|
||||
|
||||
Key variables:
|
||||
- `llama_cpp_port`: Server port (default: 8081)
|
||||
- `turboquant_kv_type`: KV cache compression type (default: turbo4)
|
||||
- `max_context_tokens`: Maximum context length (default: 131072)
|
||||
- `gemma4_model_filename`: Model filename per node
|
||||
19
ansible/deploy_turboquant.yml
Normal file
19
ansible/deploy_turboquant.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
# Deploy TurboQuant-compressed Gemma 4 across fleet nodes
# Usage: ansible-playbook -i inventory.ini ansible/deploy_turboquant.yml
# Pass `-e run_integration_tests=false` to skip the post-deploy checks.

- name: Deploy TurboQuant Gemma 4
  hosts: turboquant_fleet
  become: yes
  vars:
    turboquant_version: "main"
    model_base_path: "/opt/models"
    llama_cpp_port: 8081

  roles:
    - turboquant-deploy

  post_tasks:
    # The integration tasks live inside the role
    # (roles/turboquant-deploy/tasks/integration_test.yml), so they are loaded
    # through include_role; a play-level include_tasks path is resolved
    # relative to the playbook and would not find them.
    - name: Run integration tests
      include_role:
        name: turboquant-deploy
        tasks_from: integration_test
      # `| bool` so that `-e run_integration_tests=false`, which arrives as
      # the string "false", actually disables the tests.
      when: run_integration_tests | default(true) | bool
|
||||
38
ansible/health_check.sh
Normal file
38
ansible/health_check.sh
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/bin/bash
# TurboQuant Health Check Script
# Usage: ./health_check.sh [host] [port]
#
# Probes /v1/models, lists the loaded models, then sends one small chat
# completion. Exits 0 when every check passes and 1 otherwise.
# Requires curl and jq.

HOST="${1:-localhost}"
PORT="${2:-8081}"
TIMEOUT=5
BASE_URL="http://$HOST:$PORT"

# Without jq every response below would be reported as a failed inference,
# so a missing tool is reported explicitly instead.
for tool in curl jq; do
    if ! command -v "$tool" > /dev/null 2>&1; then
        echo "ERROR: '$tool' is required but not installed"
        exit 1
    fi
done

echo "=== TurboQuant Health Check ==="
echo "Host: $HOST:$PORT"

# Check if server is responding (-f: an HTTP error status counts as a failure)
if ! curl -sf --max-time "$TIMEOUT" "$BASE_URL/v1/models" > /dev/null 2>&1; then
    echo "ERROR: Server not responding at $HOST:$PORT"
    exit 1
fi

# Get model info (bounded by the same timeout so the script cannot hang here)
MODELS=$(curl -s --max-time "$TIMEOUT" "$BASE_URL/v1/models" | jq -r '.data[].id' 2>/dev/null)
if [ -z "$MODELS" ]; then
    echo "WARNING: No models loaded"
else
    echo "Models loaded: $MODELS"
fi

# Test inference (no -f here: on failure the error body is printed below)
RESPONSE=$(curl -s --max-time 30 "$BASE_URL/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{"model":"gemma-4","messages":[{"role":"user","content":"Say hello"}],"max_tokens":10}')

if echo "$RESPONSE" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
    echo "Inference: OK"
    echo "Response: $(echo "$RESPONSE" | jq -r '.choices[0].message.content')"
else
    echo "ERROR: Inference failed"
    echo "Response: $RESPONSE"
    exit 1
fi

echo "=== Health Check Passed ==="
|
||||
33
ansible/inventory.ini.example
Normal file
33
ansible/inventory.ini.example
Normal file
@@ -0,0 +1,33 @@
|
||||
# TurboQuant Fleet Inventory
# Copy to inventory.ini and update with actual hosts
#
# Node-specific overrides are written inline on each host line: in an INI
# inventory a [name:vars] section is only valid when `name` is a group, and
# timmy/allegro are hosts.

[turboquant_fleet]
# Mac (Timmy) - M1, 16GB
timmy ansible_host=192.168.1.100 ansible_user=apayne gemma4_model_filename=gemma-4-26B-A4B-q4_k_m.gguf max_context_tokens=131072

# Allegro VPS - 2 cores, 8GB
allegro ansible_host=167.99.126.228 ansible_user=root gemma4_model_filename=gemma-4-E4B-q4_0.gguf max_context_tokens=32768 turboquant_kv_type=turbo2

# Ezra VPS - TBD
# ezra ansible_host=TBD ansible_user=root

# Fleet-wide defaults (values on a host line above take precedence)
[turboquant_fleet:vars]
ansible_python_interpreter=/usr/bin/python3
model_base_path=/opt/models
llama_cpp_port=8081
turboquant_kv_type=turbo4
turboquant_layer_mode=7
max_context_tokens=131072
|
||||
19
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
19
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
# macOS-specific tasks

- name: Install Homebrew dependencies
  homebrew:
    name:
      - cmake
      - git
      - python@3.11
    state: present
  # Homebrew refuses to run as root; the deploy playbook enables become for
  # the whole play, so privilege escalation is switched off for this task.
  become: false

# Read-only query of the display hardware report; never reports a change.
- name: Check for Metal support
  command: system_profiler SPDisplaysDataType
  register: gpu_info
  changed_when: false

# gpu_type is 'apple' when the report mentions Metal, otherwise 'none'.
- name: Set GPU type
  set_fact:
    gpu_type: "{{ 'apple' if 'Metal' in gpu_info.stdout else 'none' }}"
|
||||
23
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
23
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
@@ -0,0 +1,23 @@
|
||||
---
# Debian/Ubuntu-specific tasks

# Compiler toolchain, CMake, git and Python, installed after an apt cache refresh.
- name: Install dependencies
  apt:
    update_cache: yes
    state: present
    name: [build-essential, cmake, git, python3, python3-pip]

# grep exits non-zero when no NVIDIA device is listed. That is an expected
# outcome, so the task never fails and never reports a change; only the
# return code is used by the next task.
- name: Check for NVIDIA GPU
  shell: lspci | grep -i nvidia
  register: nvidia_check
  changed_when: false
  failed_when: false

# gpu_type is 'nvidia' when the grep above matched, otherwise 'none'.
- name: Set GPU type
  set_fact:
    gpu_type: "{{ (nvidia_check.rc == 0) | ternary('nvidia', 'none') }}"
|
||||
38
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
38
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
@@ -0,0 +1,38 @@
|
||||
---
# Post-deploy integration tests

# Poll the models endpoint: up to 30 attempts, 5 seconds apart, until HTTP 200.
- name: Wait for TurboQuant service to be ready
  uri:
    method: GET
    url: "http://localhost:{{ llama_cpp_port }}/v1/models"
    status_code: 200
  register: model_check
  until: model_check.status == 200
  retries: 30
  delay: 5

# Send one short chat completion; the JSON reply is checked by the next task.
- name: Test inference with tool call
  uri:
    method: POST
    url: "http://localhost:{{ llama_cpp_port }}/v1/chat/completions"
    status_code: 200
    body_format: json
    body:
      model: "gemma-4"
      max_tokens: 20
      messages:
        - { role: "user", content: "Say 'test passed' and nothing else." }
  register: inference_test

# The reply must contain a non-empty `choices` list.
- name: Verify inference response
  assert:
    that:
      - "'choices' in inference_test.json"
      - "inference_test.json.choices | length > 0"
    success_msg: "Inference test passed"
    fail_msg: "Inference test failed - no valid response"

- name: Report test results
  debug:
    msg: "Integration test passed on {{ inventory_hostname }}"
|
||||
58
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
58
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
@@ -0,0 +1,58 @@
|
||||
---
# Main tasks for TurboQuant deployment

# Gathered without a filter: the tasks below read ansible_os_family,
# ansible_architecture, ansible_processor_vcpus and ansible_service_mgr, and
# none of those match an "ansible_distribution*" filter.
- name: Gather OS facts
  setup:

# Loads debian.yml / redhat.yml / darwin.yml from this role, which set gpu_type.
# NOTE(review): confirm a redhat.yml exists before targeting RedHat hosts.
- name: Include OS-specific tasks
  include_tasks: "{{ ansible_os_family | lower }}.yml"
  when: ansible_os_family in ['Debian', 'RedHat', 'Darwin']

- name: Create model directory
  file:
    path: "{{ model_base_path }}/gemma4-turboquant"
    state: directory
    mode: '0755'

- name: Clone llama.cpp TurboQuant fork
  git:
    repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/llama-cpp-turboquant.git"
    dest: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
    version: "{{ turboquant_version }}"
  register: turboquant_clone

# Rebuild only when the checkout changed or force_rebuild is set.
# gpu_type falls back to 'none' when no OS-specific task file defined it, and
# `| bool` makes `-e force_rebuild=false` (a string) behave as false.
- name: Build llama.cpp with TurboQuant
  shell: |
    cmake -B build -DGGML_METAL={{ 'ON' if ansible_architecture == 'arm64' else 'OFF' }} -DGGML_CUDA={{ 'ON' if (gpu_type | default('none')) == 'nvidia' else 'OFF' }} -DCMAKE_BUILD_TYPE=Release
    cmake --build build -j{{ ansible_processor_vcpus }}
  args:
    chdir: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
  when: turboquant_clone.changed or force_rebuild | default(false) | bool

# Skipped entirely unless a download URL is supplied.
- name: Download Gemma 4 model
  get_url:
    url: "{{ gemma4_model_url }}"
    dest: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename }}"
    mode: '0644'
  when: gemma4_model_url is defined

- name: Deploy TurboQuant server config
  template:
    src: server_config.yml.j2
    dest: "{{ model_base_path }}/gemma4-turboquant/server_config.yml"
    mode: '0644'

# The two service tasks only apply where systemd is the service manager; on
# other hosts (e.g. macOS) they are skipped instead of failing the play.
# NOTE(review): such hosts then need another way to start llama-server.
- name: Deploy systemd service
  template:
    src: turboquant.service.j2
    dest: /etc/systemd/system/turboquant.service
    mode: '0644'
  notify: restart turboquant
  when: ansible_service_mgr == 'systemd'

- name: Enable and start TurboQuant service
  systemd:
    name: turboquant
    enabled: yes
    state: started
    daemon_reload: yes
  when: ansible_service_mgr == 'systemd'
|
||||
@@ -0,0 +1,24 @@
|
||||
# TurboQuant Server Configuration
# Generated by Ansible for {{ inventory_hostname }}
# Every value below comes from an inventory/play variable; the literal inside
# each default() filter is used when that variable is not set.
# NOTE(review): the systemd unit passes its settings as CLI flags - confirm
# which component reads this file.

# Listen address, port and model file.
server:
  host: "0.0.0.0"
  port: {{ llama_cpp_port }}
  model_path: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }}"

# KV-cache compression settings.
turboquant:
  enabled: true
  kv_type: "{{ turboquant_kv_type | default('turbo4') }}"
  layer_adaptive_mode: {{ turboquant_layer_mode | default(7) }}

# Context window and batch size.
context:
  max_tokens: {{ max_context_tokens | default(131072) }}
  batch_size: {{ batch_size | default(512) }}

# Sampling defaults.
generation:
  temperature: {{ temperature | default(0.7) }}
  top_p: {{ top_p | default(0.9) }}
  top_k: {{ top_k | default(40) }}

# Mirrors turboquant.layer_adaptive_mode as an environment variable.
environment:
  TURBO_LAYER_ADAPTIVE: "{{ turboquant_layer_mode | default(7) }}"
|
||||
@@ -0,0 +1,19 @@
|
||||
# systemd unit for the TurboQuant llama-server, rendered by Ansible.
# The literal inside each default() filter is used when the variable is unset.
[Unit]
Description=TurboQuant Gemma 4 Inference Server
After=network.target

[Service]
Type=simple
User={{ turboquant_user | default('root') }}
WorkingDirectory={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}
Environment=TURBO_LAYER_ADAPTIVE={{ turboquant_layer_mode | default(7) }}
{% if ansible_architecture == 'arm64' %}
Environment=GGML_METAL_DEBUG=0
Environment=OMP_NUM_THREADS={{ ansible_processor_vcpus }}
{% endif %}
ExecStart={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}/build/bin/llama-server -m {{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }} --port {{ llama_cpp_port }} -ctk {{ turboquant_kv_type | default('turbo4') }} -ctv {{ turboquant_kv_type | default('turbo4') }} -c {{ max_context_tokens | default(131072) }} --host 0.0.0.0
# Restart after any exit, waiting 10 seconds between attempts.
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
|
||||
@@ -135,7 +135,5 @@ llama-server -m model.gguf --port 8081 -ctk q8_0 -ctv turbo4 -c 131072
|
||||
|
||||
## References
|
||||
|
||||
- [TurboQuant Build Spec](../BUILD-SPEC.md)
|
||||
- [Phase 1 Report](../PHASE1-REPORT.md)
|
||||
- [Full Knowledge Transfer](../FULL-REPORT.md)
|
||||
- [Project Status](../docs/PROJECT_STATUS.md)
|
||||
- [llama.cpp TurboQuant Fork](https://github.com/TheTom/llama-cpp-turboquant)
|
||||
|
||||
104
tests/roundtrip_test.cpp
Normal file
104
tests/roundtrip_test.cpp
Normal file
@@ -0,0 +1,104 @@
|
||||
#include "llama-turbo.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr int kDim = 128;
|
||||
constexpr float kCosineThreshold = 0.99f;
|
||||
constexpr float kZeroTolerance = 1.0e-6f;
|
||||
|
||||
// Returns true when every element of `values` is finite according to
// std::isfinite; an empty vector yields true.
[[nodiscard]] bool all_finite(const std::vector<float> & values) {
    // Advance past the finite prefix; stopping early means a non-finite
    // element was found.
    auto cursor = values.begin();
    while (cursor != values.end() && std::isfinite(*cursor)) {
        ++cursor;
    }
    return cursor == values.end();
}
|
||||
|
||||
// Returns the largest absolute value found in `values`, or 0 for an empty
// vector. A NaN element never compares greater, so it leaves the running
// maximum unchanged.
[[nodiscard]] float max_abs(const std::vector<float> & values) {
    float largest = 0.0f;
    for (const float sample : values) {
        const float magnitude = std::fabs(sample);
        if (largest < magnitude) {
            largest = magnitude;
        }
    }
    return largest;
}
|
||||
|
||||
// Cosine of the angle between `lhs` and `rhs`.
//
// Returns a value in [-1, 1] up to rounding. Degenerate inputs:
//   - both vectors have zero norm -> 1 (identical vectors)
//   - exactly one has zero norm   -> 0, so a decoder that collapses a
//     non-zero input to all zeros cannot pass a similarity threshold
// Throws std::invalid_argument when the vectors differ in length.
[[nodiscard]] float cosine_similarity(const std::vector<float> & lhs, const std::vector<float> & rhs) {
    if (lhs.size() != rhs.size()) {
        throw std::invalid_argument("cosine_similarity: vectors must have the same length");
    }

    // Accumulate in double so the sums do not lose precision over long vectors.
    double dot = 0.0;
    double lhs_sq = 0.0;
    double rhs_sq = 0.0;
    for (std::vector<float>::size_type i = 0; i < lhs.size(); ++i) {
        const double a = lhs[i];
        const double b = rhs[i];
        dot += a * b;
        lhs_sq += a * a;
        rhs_sq += b * b;
    }

    const bool lhs_zero = lhs_sq == 0.0;
    const bool rhs_zero = rhs_sq == 0.0;
    if (lhs_zero && rhs_zero) {
        return 1.0f;
    }
    if (lhs_zero || rhs_zero) {
        return 0.0f;
    }
    return static_cast<float>(dot / (std::sqrt(lhs_sq) * std::sqrt(rhs_sq)));
}
|
||||
|
||||
// Encodes `input` with polar_quant_encode_turbo4 and immediately decodes the
// result with polar_quant_decode_turbo4, returning the kDim decoded floats.
//
// `norm_out` is set to -1 before the encode call and then receives whatever
// the encoder writes through the pointer; the same value is passed to the
// decoder. `input` is read through input.data() for kDim elements.
// NOTE(review): assumes input.size() >= kDim - confirm at call sites.
[[nodiscard]] std::vector<float> roundtrip(const std::vector<float> & input, float & norm_out) {
    norm_out = -1.0f;

    // kDim / 2 bytes of packed storage (presumably two 4-bit codes per byte).
    std::vector<uint8_t> encoded(kDim / 2, 0);
    std::vector<float> restored(kDim, 0.0f);

    polar_quant_encode_turbo4(input.data(), encoded.data(), &norm_out, kDim);
    polar_quant_decode_turbo4(encoded.data(), restored.data(), norm_out, kDim);
    return restored;
}
|
||||
|
||||
// Test assertion helper: does nothing when `condition` holds, otherwise
// throws std::runtime_error carrying `message`.
void require(bool condition, const std::string & message) {
    if (condition) {
        return;
    }
    throw std::runtime_error(message);
}
|
||||
|
||||
void test_zero_vector_roundtrip() {
|
||||
std::vector<float> zeros(kDim, 0.0f);
|
||||
float norm = -1.0f;
|
||||
const auto decoded = roundtrip(zeros, norm);
|
||||
|
||||
require(norm == 0.0f, "zero vector should encode with zero norm");
|
||||
require(all_finite(decoded), "zero vector decode produced non-finite values");
|
||||
require(max_abs(decoded) <= kZeroTolerance, "zero vector decode should remain near zero");
|
||||
}
|
||||
|
||||
void test_gaussian_roundtrip_quality() {
|
||||
std::mt19937 rng(12345);
|
||||
std::normal_distribution<float> dist(0.0f, 1.0f);
|
||||
|
||||
std::vector<float> input(kDim, 0.0f);
|
||||
for (float & value : input) {
|
||||
value = dist(rng);
|
||||
}
|
||||
|
||||
float norm = -1.0f;
|
||||
const auto decoded = roundtrip(input, norm);
|
||||
|
||||
require(norm > 0.0f, "random vector should encode with positive norm");
|
||||
require(all_finite(decoded), "random vector decode produced non-finite values");
|
||||
|
||||
const float cosine = cosine_similarity(input, decoded);
|
||||
require(cosine >= kCosineThreshold, "roundtrip cosine similarity below threshold");
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
int main() {
|
||||
try {
|
||||
test_zero_vector_roundtrip();
|
||||
test_gaussian_roundtrip_quality();
|
||||
std::cout << "PASS: turboquant standalone roundtrip tests\n";
|
||||
return 0;
|
||||
} catch (const std::exception & exc) {
|
||||
std::cerr << "FAIL: " << exc.what() << '\n';
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user