Compare commits
10 Commits
step35/67-
...
burn/98-17
| Author | SHA1 | Date | |
|---|---|---|---|
| bf68627ea1 | |||
| dbbed6790f | |||
| 3e304db72a | |||
| 0d427b69d3 | |||
| f1a699a3fc | |||
| b9f2da9e19 | |||
| 5e7c637bbf | |||
| e46c2b6155 | |||
| 8e9afb34fa | |||
| 713db4962e |
47
ansible/README.md
Normal file
47
ansible/README.md
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# TurboQuant Ansible Deployment
|
||||||
|
|
||||||
|
Deploy TurboQuant-compressed Gemma 4 inference across fleet nodes.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. From the ansible/ directory, copy and edit the inventory
|
||||||
|
cp inventory.ini.example inventory.ini
|
||||||
|
vim inventory.ini
|
||||||
|
|
||||||
|
# 2. Deploy to all nodes
|
||||||
|
ansible-playbook -i inventory.ini deploy_turboquant.yml
|
||||||
|
|
||||||
|
# 3. Deploy without integration tests
|
||||||
|
ansible-playbook -i inventory.ini deploy_turboquant.yml -e run_integration_tests=false
|
||||||
|
|
||||||
|
# 4. Deploy to a specific node
|
||||||
|
ansible-playbook -i inventory.ini deploy_turboquant.yml --limit timmy
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deployment Matrix
|
||||||
|
|
||||||
|
| Node | Hardware | Model | Preset |
|
||||||
|
|------|----------|-------|--------|
|
||||||
|
| Mac (Timmy) | M1, 16GB | gemma-4-26B-A4B | turboquant_k8v4 |
|
||||||
|
| Allegro VPS | 2 cores, 8GB | gemma-4-E4B | GGUF q4_0 |
|
||||||
|
|
||||||
|
## Health Check
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check local node
|
||||||
|
./health_check.sh localhost 8081
|
||||||
|
|
||||||
|
# Check remote node
|
||||||
|
./health_check.sh 192.168.1.100 8081
|
||||||
|
```
|
||||||
|
|
||||||
|
## Role Variables
|
||||||
|
|
||||||
|
See `roles/turboquant-deploy/defaults/main.yml` for all configurable variables.
|
||||||
|
|
||||||
|
Key variables:
|
||||||
|
- `llama_cpp_port`: Server port (default: 8081)
|
||||||
|
- `turboquant_kv_type`: KV cache compression type (default: turbo4)
|
||||||
|
- `max_context_tokens`: Maximum context length (default: 131072)
|
||||||
|
- `gemma4_model_filename`: Model filename per node
|
||||||
19
ansible/deploy_turboquant.yml
Normal file
19
ansible/deploy_turboquant.yml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
---
# Deploy TurboQuant-compressed Gemma 4 across fleet nodes.
#
# Usage (run from the ansible/ directory):
#   ansible-playbook -i inventory.ini deploy_turboquant.yml
#
# Pass `-e run_integration_tests=false` to skip the post-deploy tests.

- name: Deploy TurboQuant Gemma 4
  hosts: turboquant_fleet
  become: true
  vars:
    # NOTE(review): play vars take precedence over inventory group vars, so
    # model_base_path / llama_cpp_port set under [turboquant_fleet:vars] in the
    # inventory are overridden by the values below.
    turboquant_version: "main"
    model_base_path: "/opt/models"
    llama_cpp_port: 8081

  roles:
    - turboquant-deploy

  post_tasks:
    # integration_test.yml ships inside the role's tasks directory, so it is
    # loaded through the role; a playbook-relative `tasks/...` path would not
    # resolve to it.
    - name: Run integration tests
      include_role:
        name: turboquant-deploy
        tasks_from: integration_test
      # `-e key=value` extra vars arrive as strings; `| bool` makes "false" falsy.
      when: run_integration_tests | default(true) | bool
|
||||||
38
ansible/health_check.sh
Normal file
38
ansible/health_check.sh
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
#!/bin/bash
# TurboQuant Health Check Script
# Usage: ./health_check.sh [host] [port]
#
# Checks that the server answers on /v1/models, prints the loaded model ids,
# then runs one small chat completion. Exits 0 when every step succeeds and 1
# on the first failure. Requires curl and jq.

HOST=${1:-localhost}
PORT=${2:-8081}
TIMEOUT=5
BASE_URL="http://$HOST:$PORT"

echo "=== TurboQuant Health Check ==="
echo "Host: $HOST:$PORT"

# Both tools are used below; report a missing one explicitly instead of letting
# it surface as "No models loaded" or "Inference failed".
for tool in curl jq; do
    if ! command -v "$tool" > /dev/null 2>&1; then
        echo "ERROR: Required tool '$tool' is not installed"
        exit 1
    fi
done

# Check if server is responding. -f turns HTTP 4xx/5xx into a non-zero exit so
# an erroring server is not reported as healthy; the body is kept so the model
# list does not need a second, untimed request.
if ! MODELS_JSON=$(curl -sf --max-time "$TIMEOUT" "$BASE_URL/v1/models" 2>/dev/null); then
    echo "ERROR: Server not responding at $HOST:$PORT"
    exit 1
fi

# Get model info
MODELS=$(printf '%s' "$MODELS_JSON" | jq -r '.data[].id' 2>/dev/null)
if [ -z "$MODELS" ]; then
    echo "WARNING: No models loaded"
else
    echo "Models loaded: $MODELS"
fi

# Test inference (no -f here: on failure the response body is printed below)
RESPONSE=$(curl -s --max-time 30 "$BASE_URL/v1/chat/completions" -H "Content-Type: application/json" -d '{"model":"gemma-4","messages":[{"role":"user","content":"Say hello"}],"max_tokens":10}')

if printf '%s' "$RESPONSE" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
    echo "Inference: OK"
    echo "Response: $(printf '%s' "$RESPONSE" | jq -r '.choices[0].message.content')"
else
    echo "ERROR: Inference failed"
    echo "Response: $RESPONSE"
    exit 1
fi

echo "=== Health Check Passed ==="
|
||||||
33
ansible/inventory.ini.example
Normal file
33
ansible/inventory.ini.example
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# TurboQuant Fleet Inventory
# Copy to inventory.ini and update with actual hosts
#
# Node-specific settings are written inline on each host line: timmy and
# allegro are hosts, not groups, and the INI inventory format only accepts
# [name:vars] sections for groups.

[turboquant_fleet]
# Mac (Timmy) - M1, 16GB
timmy ansible_host=192.168.1.100 ansible_user=apayne gemma4_model_filename=gemma-4-26B-A4B-q4_k_m.gguf max_context_tokens=131072

# Allegro VPS - 2 cores, 8GB
allegro ansible_host=167.99.126.228 ansible_user=root gemma4_model_filename=gemma-4-E4B-q4_0.gguf max_context_tokens=32768 turboquant_kv_type=turbo2

# Ezra VPS - TBD
# ezra ansible_host=TBD ansible_user=root

# Fleet-wide defaults; values on a host line above override these
[turboquant_fleet:vars]
ansible_python_interpreter=/usr/bin/python3
model_base_path=/opt/models
llama_cpp_port=8081
turboquant_kv_type=turbo4
turboquant_layer_mode=7
max_context_tokens=131072
|
||||||
19
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
19
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
---
# macOS-specific tasks: install build prerequisites and set `gpu_type`.

- name: Install Homebrew dependencies
  homebrew:
    name:
      - cmake
      - git
      - python@3.11
    state: present
  # Homebrew refuses to run as root, so drop any privilege escalation
  # inherited from the calling play for this task.
  become: false

# Read-only probe; its output is only inspected by the next task.
- name: Check for Metal support
  command: system_profiler SPDisplaysDataType
  register: gpu_info
  changed_when: false

# gpu_type is 'apple' when the display report mentions Metal, else 'none'.
- name: Set GPU type
  set_fact:
    gpu_type: "{{ 'apple' if 'Metal' in gpu_info.stdout else 'none' }}"
|
||||||
23
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
23
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
---
# Debian/Ubuntu-specific tasks: install build prerequisites and set `gpu_type`.

- name: Install dependencies
  apt:
    update_cache: true
    state: present
    name:
      - build-essential
      - cmake
      - git
      - python3
      - python3-pip

# Probe only: never reports a change and never fails; the return code is what
# the next task inspects.
- name: Check for NVIDIA GPU
  shell: lspci | grep -i nvidia
  changed_when: false
  failed_when: false
  register: nvidia_check

# gpu_type is 'nvidia' when the probe exited 0, otherwise 'none'.
- name: Set GPU type
  set_fact:
    gpu_type: "{{ (nvidia_check.rc == 0) | ternary('nvidia', 'none') }}"
|
||||||
38
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
38
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
---
# Post-deploy integration tests

# Poll the models endpoint: up to 30 attempts, 5 seconds apart.
- name: Wait for TurboQuant service to be ready
  register: model_check
  until: model_check.status == 200
  retries: 30
  delay: 5
  uri:
    method: GET
    url: "http://localhost:{{ llama_cpp_port }}/v1/models"
    status_code: 200

# Despite the task name, the request body below carries no tool definitions;
# it sends a single plain user message.
- name: Test inference with tool call
  register: inference_test
  uri:
    method: POST
    url: "http://localhost:{{ llama_cpp_port }}/v1/chat/completions"
    status_code: 200
    body_format: json
    body:
      model: gemma-4
      messages:
        - role: user
          content: "Say 'test passed' and nothing else."
      max_tokens: 20

# Passes when the parsed JSON has a non-empty `choices` list; the generated
# text itself is not checked.
- name: Verify inference response
  assert:
    success_msg: "Inference test passed"
    fail_msg: "Inference test failed - no valid response"
    that:
      - "'choices' in inference_test.json"
      - "inference_test.json.choices | length > 0"

- name: Report test results
  debug:
    msg: "Integration test passed on {{ inventory_hostname }}"
|
||||||
58
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
58
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
---
# Main tasks for TurboQuant deployment
#
# Flow: gather facts -> OS-specific prerequisites (these set gpu_type) ->
# clone and build the llama.cpp TurboQuant fork -> optionally download the
# model -> render the server config -> install and start the systemd service.

# The tasks below read ansible_os_family, ansible_architecture,
# ansible_processor_vcpus and ansible_service_mgr, so gather the minimal and
# hardware subsets rather than only the ansible_distribution* keys.
- name: Gather OS facts
  setup:
    gather_subset:
      - min
      - hardware

# NOTE(review): the file name is derived from the OS family, so the RedHat
# branch needs a redhat.yml next to this file - confirm it exists.
- name: Include OS-specific tasks
  include_tasks: "{{ ansible_os_family | lower }}.yml"
  when: ansible_os_family in ['Debian', 'RedHat', 'Darwin']

- name: Create model directory
  file:
    path: "{{ model_base_path }}/gemma4-turboquant"
    state: directory
    mode: '0755'

- name: Clone llama.cpp TurboQuant fork
  git:
    repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/llama-cpp-turboquant.git"
    dest: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
    version: "{{ turboquant_version }}"
  register: turboquant_clone

# A previous run may have cloned successfully but failed during the build.
# Checking for the binary lets the next run retry the build even though the
# clone no longer reports a change.
- name: Check for existing llama-server binary
  stat:
    path: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}/build/bin/llama-server"
  register: turboquant_binary

# gpu_type is set by the OS-specific task files; it defaults to 'none' here so
# hosts outside the include list above still get a CPU-only build instead of an
# undefined-variable error. `set -e` stops after a failed configure step.
- name: Build llama.cpp with TurboQuant
  shell: |
    set -e
    cmake -B build -DGGML_METAL={{ 'ON' if ansible_architecture == 'arm64' else 'OFF' }} -DGGML_CUDA={{ 'ON' if (gpu_type | default('none')) == 'nvidia' else 'OFF' }} -DCMAKE_BUILD_TYPE=Release
    cmake --build build -j{{ ansible_processor_vcpus | default(1) }}
  args:
    chdir: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
  when: >-
    turboquant_clone.changed
    or not turboquant_binary.stat.exists
    or (force_rebuild | default(false) | bool)

# Skipped unless gemma4_model_url is provided; the filename fallback matches
# the one used by the role's templates.
- name: Download Gemma 4 model
  get_url:
    url: "{{ gemma4_model_url }}"
    dest: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }}"
    mode: '0644'
  when: gemma4_model_url is defined

- name: Deploy TurboQuant server config
  template:
    src: server_config.yml.j2
    dest: "{{ model_base_path }}/gemma4-turboquant/server_config.yml"
    mode: '0644'

# The unit file and the systemd module only apply to systemd hosts; macOS
# nodes report a different service manager and are skipped here.
- name: Deploy systemd service
  template:
    src: turboquant.service.j2
    dest: /etc/systemd/system/turboquant.service
    mode: '0644'
  notify: restart turboquant
  when: ansible_service_mgr == 'systemd'

- name: Enable and start TurboQuant service
  systemd:
    name: turboquant
    enabled: true
    state: started
    daemon_reload: true
  when: ansible_service_mgr == 'systemd'

- name: Warn when the service was not installed
  debug:
    msg: >-
      {{ inventory_hostname }} uses service manager
      '{{ ansible_service_mgr }}', not systemd: llama-server was built but no
      service was installed or started. Start it manually on this host.
  when: ansible_service_mgr != 'systemd'
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
# TurboQuant Server Configuration
# Generated by Ansible for {{ inventory_hostname }}
{# Every value falls back to a default except llama_cpp_port and model_base_path, which must be defined. #}
{# llama_cpp_host is optional; set it (e.g. to 127.0.0.1) to stop binding on all interfaces. #}

server:
  host: "{{ llama_cpp_host | default('0.0.0.0') }}"
  port: {{ llama_cpp_port }}
  model_path: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }}"

turboquant:
  enabled: true
  kv_type: "{{ turboquant_kv_type | default('turbo4') }}"
  layer_adaptive_mode: {{ turboquant_layer_mode | default(7) }}

context:
  max_tokens: {{ max_context_tokens | default(131072) }}
  batch_size: {{ batch_size | default(512) }}

generation:
  temperature: {{ temperature | default(0.7) }}
  top_p: {{ top_p | default(0.9) }}
  top_k: {{ top_k | default(40) }}

environment:
  TURBO_LAYER_ADAPTIVE: "{{ turboquant_layer_mode | default(7) }}"
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
[Unit]
Description=TurboQuant Gemma 4 Inference Server
After=network.target

[Service]
Type=simple
User={{ turboquant_user | default('root') }}
WorkingDirectory={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}
Environment=TURBO_LAYER_ADAPTIVE={{ turboquant_layer_mode | default(7) }}
{# NOTE(review): this branch only renders when the architecture fact is exactly 'arm64' - confirm that value is reported on the hosts that receive this unit. #}
{% if ansible_architecture == 'arm64' %}
Environment=GGML_METAL_DEBUG=0
Environment=OMP_NUM_THREADS={{ ansible_processor_vcpus }}
{% endif %}
{# The KV type is passed to both -ctk and -ctv. llama_cpp_host is optional; set it (e.g. to 127.0.0.1) to stop listening on all interfaces. #}
ExecStart={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}/build/bin/llama-server -m {{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }} --port {{ llama_cpp_port }} -ctk {{ turboquant_kv_type | default('turbo4') }} -ctv {{ turboquant_kv_type | default('turbo4') }} -c {{ max_context_tokens | default(131072) }} --host {{ llama_cpp_host | default('0.0.0.0') }}
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
|
||||||
Reference in New Issue
Block a user