Compare commits
10 Commits
step35/55-
...
burn/98-17
| Author | SHA1 | Date | |
|---|---|---|---|
| bf68627ea1 | |||
| dbbed6790f | |||
| 3e304db72a | |||
| 0d427b69d3 | |||
| f1a699a3fc | |||
| b9f2da9e19 | |||
| 5e7c637bbf | |||
| e46c2b6155 | |||
| 8e9afb34fa | |||
| 713db4962e |
47
ansible/README.md
Normal file
47
ansible/README.md
Normal file
@@ -0,0 +1,47 @@
# TurboQuant Ansible Deployment

Deploy TurboQuant-compressed Gemma 4 inference across fleet nodes.

## Quick Start

```bash
# 1. Copy and edit inventory
cp inventory.ini.example inventory.ini
vim inventory.ini

# 2. Deploy to all nodes
ansible-playbook -i inventory.ini deploy_turboquant.yml

# 3. Deploy without integration tests
ansible-playbook -i inventory.ini deploy_turboquant.yml -e run_integration_tests=false

# 4. Deploy to specific node
ansible-playbook -i inventory.ini deploy_turboquant.yml --limit timmy
```

## Deployment Matrix

| Node | Hardware | Model | Preset |
|------|----------|-------|--------|
| Mac (Timmy) | M1, 16GB | gemma-4-26B-A4B | turboquant_k8v4 |
| Allegro VPS | 2 cores, 8GB | gemma-4-E4B | GGUF q4_0 |

## Health Check

```bash
# Check local node
./health_check.sh localhost 8081

# Check remote node
./health_check.sh 192.168.1.100 8081
```

## Role Variables

See `roles/turboquant-deploy/defaults/main.yml` for all configurable variables.

Key variables:

- `llama_cpp_port`: Server port (default: 8081)
- `turboquant_kv_type`: KV cache compression type (default: turbo4)
- `max_context_tokens`: Maximum context length (default: 131072)
- `gemma4_model_filename`: Model filename per node
19
ansible/deploy_turboquant.yml
Normal file
19
ansible/deploy_turboquant.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
|
||||
# Deploy TurboQuant-compressed Gemma 4 across fleet nodes
|
||||
# Usage: ansible-playbook -i inventory.ini ansible/deploy_turboquant.yml
|
||||
|
||||
- name: Deploy TurboQuant Gemma 4
|
||||
hosts: turboquant_fleet
|
||||
become: yes
|
||||
vars:
|
||||
turboquant_version: "main"
|
||||
model_base_path: "/opt/models"
|
||||
llama_cpp_port: 8081
|
||||
|
||||
roles:
|
||||
- turboquant-deploy
|
||||
|
||||
post_tasks:
|
||||
- name: Run integration tests
|
||||
include_tasks: tasks/integration_test.yml
|
||||
when: run_integration_tests | default(true)
|
||||
38
ansible/health_check.sh
Normal file
38
ansible/health_check.sh
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/bin/bash
|
||||
# TurboQuant Health Check Script
|
||||
# Usage: ./health_check.sh [host] [port]
|
||||
|
||||
HOST=${1:-localhost}
|
||||
PORT=${2:-8081}
|
||||
TIMEOUT=5
|
||||
|
||||
echo "=== TurboQuant Health Check ==="
|
||||
echo "Host: $HOST:$PORT"
|
||||
|
||||
# Check if server is responding
|
||||
if ! curl -s --max-time $TIMEOUT "http://$HOST:$PORT/v1/models" > /dev/null 2>&1; then
|
||||
echo "ERROR: Server not responding at $HOST:$PORT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get model info
|
||||
MODELS=$(curl -s "http://$HOST:$PORT/v1/models" | jq -r '.data[].id' 2>/dev/null)
|
||||
if [ -z "$MODELS" ]; then
|
||||
echo "WARNING: No models loaded"
|
||||
else
|
||||
echo "Models loaded: $MODELS"
|
||||
fi
|
||||
|
||||
# Test inference
|
||||
RESPONSE=$(curl -s --max-time 30 "http://$HOST:$PORT/v1/chat/completions" -H "Content-Type: application/json" -d '{"model":"gemma-4","messages":[{"role":"user","content":"Say hello"}],"max_tokens":10}')
|
||||
|
||||
if echo "$RESPONSE" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
|
||||
echo "Inference: OK"
|
||||
echo "Response: $(echo "$RESPONSE" | jq -r '.choices[0].message.content')"
|
||||
else
|
||||
echo "ERROR: Inference failed"
|
||||
echo "Response: $RESPONSE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "=== Health Check Passed ==="
|
||||
33
ansible/inventory.ini.example
Normal file
33
ansible/inventory.ini.example
Normal file
@@ -0,0 +1,33 @@
|
||||
# TurboQuant Fleet Inventory
|
||||
# Copy to inventory.ini and update with actual hosts
|
||||
|
||||
[turboquant_fleet]
|
||||
# Mac (Timmy) - M1, 16GB
|
||||
timmy ansible_host=192.168.1.100 ansible_user=apayne
|
||||
|
||||
# Allegro VPS - 2 cores, 8GB
|
||||
allegro ansible_host=167.99.126.228 ansible_user=root
|
||||
|
||||
# Ezra VPS - TBD
|
||||
# ezra ansible_host=TBD ansible_user=root
|
||||
|
||||
[turboquant_fleet:vars]
|
||||
ansible_python_interpreter=/usr/bin/python3
|
||||
|
||||
# Per-host configuration
|
||||
[turboquant_fleet:vars]
|
||||
model_base_path=/opt/models
|
||||
llama_cpp_port=8081
|
||||
turboquant_kv_type=turbo4
|
||||
turboquant_layer_mode=7
|
||||
max_context_tokens=131072
|
||||
|
||||
# Node-specific overrides
|
||||
[timmy:vars]
|
||||
gemma4_model_filename=gemma-4-26B-A4B-q4_k_m.gguf
|
||||
max_context_tokens=131072
|
||||
|
||||
[allegro:vars]
|
||||
gemma4_model_filename=gemma-4-E4B-q4_0.gguf
|
||||
max_context_tokens=32768
|
||||
turboquant_kv_type=turbo2
|
||||
19
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
19
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
|
||||
# macOS-specific tasks
|
||||
|
||||
- name: Install Homebrew dependencies
|
||||
homebrew:
|
||||
name:
|
||||
- cmake
|
||||
- git
|
||||
- python@3.11
|
||||
state: present
|
||||
|
||||
- name: Check for Metal support
|
||||
command: system_profiler SPDisplaysDataType
|
||||
register: gpu_info
|
||||
changed_when: false
|
||||
|
||||
- name: Set GPU type
|
||||
set_fact:
|
||||
gpu_type: "{{ 'apple' if 'Metal' in gpu_info.stdout else 'none' }}"
|
||||
23
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
23
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
@@ -0,0 +1,23 @@
|
||||
---
|
||||
# Debian/Ubuntu-specific tasks
|
||||
|
||||
- name: Install dependencies
|
||||
apt:
|
||||
name:
|
||||
- build-essential
|
||||
- cmake
|
||||
- git
|
||||
- python3
|
||||
- python3-pip
|
||||
state: present
|
||||
update_cache: yes
|
||||
|
||||
- name: Check for NVIDIA GPU
|
||||
shell: lspci | grep -i nvidia
|
||||
register: nvidia_check
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Set GPU type
|
||||
set_fact:
|
||||
gpu_type: "{{ 'nvidia' if nvidia_check.rc == 0 else 'none' }}"
|
||||
38
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
38
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
@@ -0,0 +1,38 @@
|
||||
---
|
||||
# Post-deploy integration tests
|
||||
|
||||
- name: Wait for TurboQuant service to be ready
|
||||
uri:
|
||||
url: "http://localhost:{{ llama_cpp_port }}/v1/models"
|
||||
method: GET
|
||||
status_code: 200
|
||||
register: model_check
|
||||
retries: 30
|
||||
delay: 5
|
||||
until: model_check.status == 200
|
||||
|
||||
- name: Test inference with tool call
|
||||
uri:
|
||||
url: "http://localhost:{{ llama_cpp_port }}/v1/chat/completions"
|
||||
method: POST
|
||||
body_format: json
|
||||
body:
|
||||
model: "gemma-4"
|
||||
messages:
|
||||
- role: "user"
|
||||
content: "Say 'test passed' and nothing else."
|
||||
max_tokens: 20
|
||||
status_code: 200
|
||||
register: inference_test
|
||||
|
||||
- name: Verify inference response
|
||||
assert:
|
||||
that:
|
||||
- "'choices' in inference_test.json"
|
||||
- "inference_test.json.choices | length > 0"
|
||||
fail_msg: "Inference test failed - no valid response"
|
||||
success_msg: "Inference test passed"
|
||||
|
||||
- name: Report test results
|
||||
debug:
|
||||
msg: "Integration test passed on {{ inventory_hostname }}"
|
||||
58
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
58
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
@@ -0,0 +1,58 @@
|
||||
---
|
||||
# Main tasks for TurboQuant deployment
|
||||
|
||||
- name: Gather OS facts
|
||||
setup:
|
||||
filter: ansible_distribution*
|
||||
|
||||
- name: Include OS-specific tasks
|
||||
include_tasks: "{{ ansible_os_family | lower }}.yml"
|
||||
when: ansible_os_family in ['Debian', 'RedHat', 'Darwin']
|
||||
|
||||
- name: Create model directory
|
||||
file:
|
||||
path: "{{ model_base_path }}/gemma4-turboquant"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Clone llama.cpp TurboQuant fork
|
||||
git:
|
||||
repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/llama-cpp-turboquant.git"
|
||||
dest: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
|
||||
version: "{{ turboquant_version }}"
|
||||
register: turboquant_clone
|
||||
|
||||
- name: Build llama.cpp with TurboQuant
|
||||
shell: |
|
||||
cmake -B build -DGGML_METAL={{ 'ON' if ansible_architecture == 'arm64' else 'OFF' }} -DGGML_CUDA={{ 'ON' if gpu_type == 'nvidia' else 'OFF' }} -DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build -j{{ ansible_processor_vcpus }}
|
||||
args:
|
||||
chdir: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
|
||||
when: turboquant_clone.changed or force_rebuild | default(false)
|
||||
|
||||
- name: Download Gemma 4 model
|
||||
get_url:
|
||||
url: "{{ gemma4_model_url }}"
|
||||
dest: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename }}"
|
||||
mode: '0644'
|
||||
when: gemma4_model_url is defined
|
||||
|
||||
- name: Deploy TurboQuant server config
|
||||
template:
|
||||
src: server_config.yml.j2
|
||||
dest: "{{ model_base_path }}/gemma4-turboquant/server_config.yml"
|
||||
mode: '0644'
|
||||
|
||||
- name: Deploy systemd service
|
||||
template:
|
||||
src: turboquant.service.j2
|
||||
dest: /etc/systemd/system/turboquant.service
|
||||
mode: '0644'
|
||||
notify: restart turboquant
|
||||
|
||||
- name: Enable and start TurboQuant service
|
||||
systemd:
|
||||
name: turboquant
|
||||
enabled: yes
|
||||
state: started
|
||||
daemon_reload: yes
|
||||
@@ -0,0 +1,24 @@
|
||||
# TurboQuant Server Configuration
|
||||
# Generated by Ansible for {{ inventory_hostname }}
|
||||
|
||||
server:
|
||||
host: "0.0.0.0"
|
||||
port: {{ llama_cpp_port }}
|
||||
model_path: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }}"
|
||||
|
||||
turboquant:
|
||||
enabled: true
|
||||
kv_type: "{{ turboquant_kv_type | default('turbo4') }}"
|
||||
layer_adaptive_mode: {{ turboquant_layer_mode | default(7) }}
|
||||
|
||||
context:
|
||||
max_tokens: {{ max_context_tokens | default(131072) }}
|
||||
batch_size: {{ batch_size | default(512) }}
|
||||
|
||||
generation:
|
||||
temperature: {{ temperature | default(0.7) }}
|
||||
top_p: {{ top_p | default(0.9) }}
|
||||
top_k: {{ top_k | default(40) }}
|
||||
|
||||
environment:
|
||||
TURBO_LAYER_ADAPTIVE: "{{ turboquant_layer_mode | default(7) }}"
|
||||
@@ -0,0 +1,19 @@
|
||||
[Unit]
|
||||
Description=TurboQuant Gemma 4 Inference Server
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User={{ turboquant_user | default('root') }}
|
||||
WorkingDirectory={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}
|
||||
Environment=TURBO_LAYER_ADAPTIVE={{ turboquant_layer_mode | default(7) }}
|
||||
{% if ansible_architecture == 'arm64' %}
|
||||
Environment=GGML_METAL_DEBUG=0
|
||||
Environment=OMP_NUM_THREADS={{ ansible_processor_vcpus }}
|
||||
{% endif %}
|
||||
ExecStart={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}/build/bin/llama-server -m {{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }} --port {{ llama_cpp_port }} -ctk {{ turboquant_kv_type | default('turbo4') }} -ctv {{ turboquant_kv_type | default('turbo4') }} -c {{ max_context_tokens | default(131072) }} --host 0.0.0.0
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
Reference in New Issue
Block a user