Compare commits

...

10 Commits

10 changed files with 318 additions and 0 deletions

47
ansible/README.md Normal file
View File

@@ -0,0 +1,47 @@
# TurboQuant Ansible Deployment
Deploy TurboQuant-compressed Gemma 4 inference across fleet nodes.
## Quick Start
```bash
# 1. Copy and edit inventory
cp inventory.ini.example inventory.ini
vim inventory.ini
# 2. Deploy to all nodes
ansible-playbook -i inventory.ini deploy_turboquant.yml
# 3. Deploy without integration tests
ansible-playbook -i inventory.ini deploy_turboquant.yml -e run_integration_tests=false
# 4. Deploy to specific node
ansible-playbook -i inventory.ini deploy_turboquant.yml --limit timmy
```
## Deployment Matrix
| Node | Hardware | Model | Preset |
|------|----------|-------|--------|
| Mac (Timmy) | M1, 16GB | gemma-4-26B-A4B | turboquant_k8v4 |
| Allegro VPS | 2 cores, 8GB | gemma-4-E4B | GGUF q4_0 |
## Health Check
```bash
# Check local node
./health_check.sh localhost 8081
# Check remote node
./health_check.sh 192.168.1.100 8081
```
## Role Variables
See `roles/turboquant-deploy/defaults/main.yml` for all configurable variables.
Key variables:
- `llama_cpp_port`: Server port (default: 8081)
- `turboquant_kv_type`: KV cache compression type (default: turbo4)
- `max_context_tokens`: Maximum context length (default: 131072)
- `gemma4_model_filename`: Model filename per node

View File

@@ -0,0 +1,19 @@
---
# Deploy TurboQuant-compressed Gemma 4 across fleet nodes.
# Usage: ansible-playbook -i inventory.ini ansible/deploy_turboquant.yml
#
# Extra vars:
#   run_integration_tests (default true) — pass -e run_integration_tests=false
#   to skip the post-deploy tests.
- name: Deploy TurboQuant Gemma 4
  hosts: turboquant_fleet
  become: true  # package installs, /opt, and systemd units need root
  vars:
    turboquant_version: "main"
    model_base_path: "/opt/models"
    llama_cpp_port: 8081
  roles:
    - turboquant-deploy
  post_tasks:
    - name: Run integration tests
      include_tasks: tasks/integration_test.yml
      # `| bool` is required: values from -e arrive as strings, and the
      # non-empty string "false" is truthy in a bare Jinja condition, so
      # without it the tests could never be skipped from the CLI.
      when: run_integration_tests | default(true) | bool

38
ansible/health_check.sh Normal file
View File

@@ -0,0 +1,38 @@
#!/bin/bash
# TurboQuant Health Check Script
#
# Probes a llama-server instance: checks liveness via /v1/models, lists the
# loaded models, then runs a tiny chat completion as an end-to-end test.
#
# Usage: ./health_check.sh [host] [port]
# Exit codes: 0 = healthy, 1 = server down or inference failed, 2 = missing jq.

HOST=${1:-localhost}
PORT=${2:-8081}
TIMEOUT=5
BASE_URL="http://$HOST:$PORT"

echo "=== TurboQuant Health Check ==="
echo "Host: $HOST:$PORT"

# jq is a hard dependency for response parsing; fail loudly up front instead
# of silently reporting "No models loaded" / "Inference failed" when jq is
# merely absent from the node.
if ! command -v jq > /dev/null 2>&1; then
    echo "ERROR: jq is required but not installed"
    exit 2
fi

# One request serves both the liveness check and the model listing
# (previously /v1/models was fetched twice).
if ! MODELS_JSON=$(curl -s --max-time "$TIMEOUT" "$BASE_URL/v1/models"); then
    echo "ERROR: Server not responding at $HOST:$PORT"
    exit 1
fi

MODELS=$(echo "$MODELS_JSON" | jq -r '.data[].id' 2>/dev/null)
if [ -z "$MODELS" ]; then
    echo "WARNING: No models loaded"
else
    echo "Models loaded: $MODELS"
fi

# End-to-end inference test; longer timeout because the first token after a
# cold start can be slow.
RESPONSE=$(curl -s --max-time 30 "$BASE_URL/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{"model":"gemma-4","messages":[{"role":"user","content":"Say hello"}],"max_tokens":10}')
if echo "$RESPONSE" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
    echo "Inference: OK"
    echo "Response: $(echo "$RESPONSE" | jq -r '.choices[0].message.content')"
else
    echo "ERROR: Inference failed"
    echo "Response: $RESPONSE"
    exit 1
fi
echo "=== Health Check Passed ==="

View File

@@ -0,0 +1,33 @@
# TurboQuant Fleet Inventory
# Copy to inventory.ini and update with actual hosts.
#
# Per-host overrides are set inline on the host line. `[hostname:vars]`
# sections are NOT valid for hosts in INI inventories: Ansible treats the
# section name as a *group*, creating an empty group "timmy"/"allegro" whose
# vars never reach the identically-named host. Inline host vars also take
# precedence over group vars, which is the intended override order here.

[turboquant_fleet]
# Mac (Timmy) - M1, 16GB
timmy ansible_host=192.168.1.100 ansible_user=apayne gemma4_model_filename=gemma-4-26B-A4B-q4_k_m.gguf max_context_tokens=131072

# Allegro VPS - 2 cores, 8GB
allegro ansible_host=167.99.126.228 ansible_user=root gemma4_model_filename=gemma-4-E4B-q4_0.gguf max_context_tokens=32768 turboquant_kv_type=turbo2

# Ezra VPS - TBD
# ezra ansible_host=TBD ansible_user=root

# Fleet-wide defaults (single vars section — the original repeated the
# [turboquant_fleet:vars] heading, which is confusing even though Ansible
# merges duplicates).
[turboquant_fleet:vars]
ansible_python_interpreter=/usr/bin/python3
model_base_path=/opt/models
llama_cpp_port=8081
turboquant_kv_type=turbo4
turboquant_layer_mode=7
max_context_tokens=131072

View File

@@ -0,0 +1,19 @@
---
# macOS-specific tasks
# Installs build prerequisites via Homebrew and detects Metal GPU support,
# setting the `gpu_type` fact consumed by the build step in main.yml.
# NOTE(review): the playbook runs with become enabled, but Homebrew refuses
# to run as root — confirm privilege handling on macOS hosts.
- name: Install Homebrew dependencies
  homebrew:
    name:
      - cmake
      - git
      - python@3.11
    state: present

# Query display hardware; read-only, so never reported as "changed".
- name: Check for Metal support
  command: system_profiler SPDisplaysDataType
  register: gpu_info
  changed_when: false

# gpu_type: 'apple' when the profiler output mentions Metal, else 'none'.
- name: Set GPU type
  set_fact:
    gpu_type: "{{ 'apple' if 'Metal' in gpu_info.stdout else 'none' }}"

View File

@@ -0,0 +1,23 @@
---
# Debian/Ubuntu-specific tasks
# Installs build prerequisites via apt and detects an NVIDIA GPU, setting
# the `gpu_type` fact consumed by the build step in main.yml.
- name: Install dependencies
  apt:
    name:
      - build-essential
      - cmake
      - git
      - python3
      - python3-pip
    state: present
    update_cache: yes

# Non-fatal probe: a non-zero rc (no NVIDIA device found, or lspci not
# installed) is treated as "no GPU" by the task below rather than failing
# the play. Read-only, so never reported as "changed".
- name: Check for NVIDIA GPU
  shell: lspci | grep -i nvidia
  register: nvidia_check
  failed_when: false
  changed_when: false

# gpu_type: 'nvidia' when the device was found, else 'none'; main.yml uses
# this to toggle -DGGML_CUDA at build time.
- name: Set GPU type
  set_fact:
    gpu_type: "{{ 'nvidia' if nvidia_check.rc == 0 else 'none' }}"

View File

@@ -0,0 +1,38 @@
---
# Post-deploy integration tests: wait for the server to come up, send one
# inference request, and assert that a well-formed completion came back.
# Included from the playbook's post_tasks (skippable via
# run_integration_tests=false).
- name: Wait for TurboQuant service to be ready
  uri:
    url: "http://localhost:{{ llama_cpp_port }}/v1/models"
    method: GET
    status_code: 200
  register: model_check
  # Poll for up to 30 * 5s = 150s while the server starts and loads the model.
  retries: 30
  delay: 5
  until: model_check.status == 200

# Minimal end-to-end request against the OpenAI-compatible chat endpoint.
# NOTE(review): the task name mentions a tool call but the request body
# defines no tools — confirm whether tool-call coverage was intended.
- name: Test inference with tool call
  uri:
    url: "http://localhost:{{ llama_cpp_port }}/v1/chat/completions"
    method: POST
    body_format: json
    body:
      model: "gemma-4"
      messages:
        - role: "user"
          content: "Say 'test passed' and nothing else."
      max_tokens: 20
    status_code: 200
  register: inference_test

# A valid completion must carry a non-empty `choices` array; the content
# itself is not asserted, only response shape.
- name: Verify inference response
  assert:
    that:
      - "'choices' in inference_test.json"
      - "inference_test.json.choices | length > 0"
    fail_msg: "Inference test failed - no valid response"
    success_msg: "Inference test passed"

- name: Report test results
  debug:
    msg: "Integration test passed on {{ inventory_hostname }}"

View File

@@ -0,0 +1,58 @@
---
# Main tasks for TurboQuant deployment: install build deps (OS-specific),
# build the llama.cpp TurboQuant fork, fetch the model, render configs,
# and install/start the service.
- name: Gather OS facts
  setup:
    filter: ansible_distribution*

# Dispatches to darwin.yml / debian.yml / redhat.yml by OS family.
- name: Include OS-specific tasks
  include_tasks: "{{ ansible_os_family | lower }}.yml"
  when: ansible_os_family in ['Debian', 'RedHat', 'Darwin']

- name: Create model directory
  file:
    path: "{{ model_base_path }}/gemma4-turboquant"
    state: directory
    mode: '0755'

- name: Clone llama.cpp TurboQuant fork
  git:
    repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/llama-cpp-turboquant.git"
    dest: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
    version: "{{ turboquant_version }}"
  register: turboquant_clone

# Rebuild only when the checkout changed or force_rebuild=true was passed;
# `| bool` guards against the CLI delivering force_rebuild as the truthy
# string "false".
- name: Build llama.cpp with TurboQuant
  shell: |
    cmake -B build -DGGML_METAL={{ 'ON' if ansible_architecture == 'arm64' else 'OFF' }} -DGGML_CUDA={{ 'ON' if gpu_type == 'nvidia' else 'OFF' }} -DCMAKE_BUILD_TYPE=Release
    cmake --build build -j{{ ansible_processor_vcpus }}
  args:
    chdir: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
  when: turboquant_clone.changed or force_rebuild | default(false) | bool

- name: Download Gemma 4 model
  get_url:
    url: "{{ gemma4_model_url }}"
    dest: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename }}"
    mode: '0644'
  when: gemma4_model_url is defined

- name: Deploy TurboQuant server config
  template:
    src: server_config.yml.j2
    dest: "{{ model_base_path }}/gemma4-turboquant/server_config.yml"
    mode: '0644'

# The fleet includes a macOS node (see darwin.yml), and macOS has no
# systemd — without this guard the play fails there. A launchd equivalent
# for Darwin hosts is still TODO.
- name: Deploy systemd service
  template:
    src: turboquant.service.j2
    dest: /etc/systemd/system/turboquant.service
    mode: '0644'
  notify: restart turboquant
  when: ansible_service_mgr == 'systemd'

- name: Enable and start TurboQuant service
  systemd:
    name: turboquant
    enabled: true
    state: started
    daemon_reload: true
  when: ansible_service_mgr == 'systemd'

View File

@@ -0,0 +1,24 @@
# TurboQuant Server Configuration
# Generated by Ansible for {{ inventory_hostname }}
# Rendered from server_config.yml.j2; the Jinja defaults below mirror the
# fleet-wide values set in the inventory.
server:
  host: "0.0.0.0"
  port: {{ llama_cpp_port }}
  model_path: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }}"
# KV-cache compression settings.
turboquant:
  enabled: true
  kv_type: "{{ turboquant_kv_type | default('turbo4') }}"
  layer_adaptive_mode: {{ turboquant_layer_mode | default(7) }}
context:
  max_tokens: {{ max_context_tokens | default(131072) }}
  batch_size: {{ batch_size | default(512) }}
# Default sampling parameters.
generation:
  temperature: {{ temperature | default(0.7) }}
  top_p: {{ top_p | default(0.9) }}
  top_k: {{ top_k | default(40) }}
# NOTE(review): the same layer-mode value is exported both here and as
# Environment= in the systemd unit — confirm which one the server reads.
environment:
  TURBO_LAYER_ADAPTIVE: "{{ turboquant_layer_mode | default(7) }}"

View File

@@ -0,0 +1,19 @@
# TurboQuant systemd unit, rendered from turboquant.service.j2.
# NOTE(review): the arm64 branch below presumably targets Apple Silicon, but
# macOS does not run systemd — confirm which hosts actually consume this unit.
[Unit]
Description=TurboQuant Gemma 4 Inference Server
After=network.target

[Service]
Type=simple
User={{ turboquant_user | default('root') }}
WorkingDirectory={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}
Environment=TURBO_LAYER_ADAPTIVE={{ turboquant_layer_mode | default(7) }}
{% if ansible_architecture == 'arm64' %}
Environment=GGML_METAL_DEBUG=0
Environment=OMP_NUM_THREADS={{ ansible_processor_vcpus }}
{% endif %}
# -ctk / -ctv select the KV-cache type for keys and values respectively;
# -c sets the context length in tokens.
ExecStart={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}/build/bin/llama-server -m {{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }} --port {{ llama_cpp_port }} -ctk {{ turboquant_kv_type | default('turbo4') }} -ctv {{ turboquant_kv_type | default('turbo4') }} -c {{ max_context_tokens | default(131072) }} --host 0.0.0.0
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target