Compare commits
10 Commits
step35/55-
...
burn/98-17
| Author | SHA1 | Date | |
|---|---|---|---|
| bf68627ea1 | |||
| dbbed6790f | |||
| 3e304db72a | |||
| 0d427b69d3 | |||
| f1a699a3fc | |||
| b9f2da9e19 | |||
| 5e7c637bbf | |||
| e46c2b6155 | |||
| 8e9afb34fa | |||
| 713db4962e |
47
ansible/README.md
Normal file
47
ansible/README.md
Normal file
@@ -0,0 +1,47 @@
# TurboQuant Ansible Deployment

Deploy TurboQuant-compressed Gemma 4 inference across fleet nodes.

## Quick Start

```bash
# 1. Copy and edit inventory
cp inventory.ini.example inventory.ini
vim inventory.ini

# 2. Deploy to all nodes
ansible-playbook -i inventory.ini deploy_turboquant.yml

# 3. Deploy without integration tests
ansible-playbook -i inventory.ini deploy_turboquant.yml -e run_integration_tests=false

# 4. Deploy to specific node
ansible-playbook -i inventory.ini deploy_turboquant.yml --limit timmy
```

## Deployment Matrix

| Node | Hardware | Model | Preset |
|------|----------|-------|--------|
| Mac (Timmy) | M1, 16GB | gemma-4-26B-A4B | turboquant_k8v4 |
| Allegro VPS | 2 cores, 8GB | gemma-4-E4B | GGUF q4_0 |

## Health Check

```bash
# Check local node
./health_check.sh localhost 8081

# Check remote node
./health_check.sh 192.168.1.100 8081
```

## Role Variables

See `roles/turboquant-deploy/defaults/main.yml` for all configurable variables.

Key variables:

- `llama_cpp_port`: Server port (default: 8081)
- `turboquant_kv_type`: KV cache compression type (default: turbo4)
- `max_context_tokens`: Maximum context length (default: 131072)
- `gemma4_model_filename`: Model filename per node
19
ansible/deploy_turboquant.yml
Normal file
19
ansible/deploy_turboquant.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
|
||||
# Deploy TurboQuant-compressed Gemma 4 across fleet nodes
|
||||
# Usage: ansible-playbook -i inventory.ini ansible/deploy_turboquant.yml
|
||||
|
||||
- name: Deploy TurboQuant Gemma 4
|
||||
hosts: turboquant_fleet
|
||||
become: yes
|
||||
vars:
|
||||
turboquant_version: "main"
|
||||
model_base_path: "/opt/models"
|
||||
llama_cpp_port: 8081
|
||||
|
||||
roles:
|
||||
- turboquant-deploy
|
||||
|
||||
post_tasks:
|
||||
- name: Run integration tests
|
||||
include_tasks: tasks/integration_test.yml
|
||||
when: run_integration_tests | default(true)
|
||||
38
ansible/health_check.sh
Normal file
38
ansible/health_check.sh
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/bin/bash
|
||||
# TurboQuant Health Check Script
|
||||
# Usage: ./health_check.sh [host] [port]
|
||||
|
||||
HOST=${1:-localhost}
|
||||
PORT=${2:-8081}
|
||||
TIMEOUT=5
|
||||
|
||||
echo "=== TurboQuant Health Check ==="
|
||||
echo "Host: $HOST:$PORT"
|
||||
|
||||
# Check if server is responding
|
||||
if ! curl -s --max-time $TIMEOUT "http://$HOST:$PORT/v1/models" > /dev/null 2>&1; then
|
||||
echo "ERROR: Server not responding at $HOST:$PORT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get model info
|
||||
MODELS=$(curl -s "http://$HOST:$PORT/v1/models" | jq -r '.data[].id' 2>/dev/null)
|
||||
if [ -z "$MODELS" ]; then
|
||||
echo "WARNING: No models loaded"
|
||||
else
|
||||
echo "Models loaded: $MODELS"
|
||||
fi
|
||||
|
||||
# Test inference
|
||||
RESPONSE=$(curl -s --max-time 30 "http://$HOST:$PORT/v1/chat/completions" -H "Content-Type: application/json" -d '{"model":"gemma-4","messages":[{"role":"user","content":"Say hello"}],"max_tokens":10}')
|
||||
|
||||
if echo "$RESPONSE" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
|
||||
echo "Inference: OK"
|
||||
echo "Response: $(echo "$RESPONSE" | jq -r '.choices[0].message.content')"
|
||||
else
|
||||
echo "ERROR: Inference failed"
|
||||
echo "Response: $RESPONSE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "=== Health Check Passed ==="
|
||||
33
ansible/inventory.ini.example
Normal file
33
ansible/inventory.ini.example
Normal file
@@ -0,0 +1,33 @@
|
||||
# TurboQuant Fleet Inventory
|
||||
# Copy to inventory.ini and update with actual hosts
|
||||
|
||||
[turboquant_fleet]
|
||||
# Mac (Timmy) - M1, 16GB
|
||||
timmy ansible_host=192.168.1.100 ansible_user=apayne
|
||||
|
||||
# Allegro VPS - 2 cores, 8GB
|
||||
allegro ansible_host=167.99.126.228 ansible_user=root
|
||||
|
||||
# Ezra VPS - TBD
|
||||
# ezra ansible_host=TBD ansible_user=root
|
||||
|
||||
[turboquant_fleet:vars]
|
||||
ansible_python_interpreter=/usr/bin/python3
|
||||
|
||||
# Per-host configuration
|
||||
[turboquant_fleet:vars]
|
||||
model_base_path=/opt/models
|
||||
llama_cpp_port=8081
|
||||
turboquant_kv_type=turbo4
|
||||
turboquant_layer_mode=7
|
||||
max_context_tokens=131072
|
||||
|
||||
# Node-specific overrides
|
||||
[timmy:vars]
|
||||
gemma4_model_filename=gemma-4-26B-A4B-q4_k_m.gguf
|
||||
max_context_tokens=131072
|
||||
|
||||
[allegro:vars]
|
||||
gemma4_model_filename=gemma-4-E4B-q4_0.gguf
|
||||
max_context_tokens=32768
|
||||
turboquant_kv_type=turbo2
|
||||
19
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
19
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
|
||||
# macOS-specific tasks
|
||||
|
||||
- name: Install Homebrew dependencies
|
||||
homebrew:
|
||||
name:
|
||||
- cmake
|
||||
- git
|
||||
- python@3.11
|
||||
state: present
|
||||
|
||||
- name: Check for Metal support
|
||||
command: system_profiler SPDisplaysDataType
|
||||
register: gpu_info
|
||||
changed_when: false
|
||||
|
||||
- name: Set GPU type
|
||||
set_fact:
|
||||
gpu_type: "{{ 'apple' if 'Metal' in gpu_info.stdout else 'none' }}"
|
||||
23
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
23
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
@@ -0,0 +1,23 @@
|
||||
---
|
||||
# Debian/Ubuntu-specific tasks
|
||||
|
||||
- name: Install dependencies
|
||||
apt:
|
||||
name:
|
||||
- build-essential
|
||||
- cmake
|
||||
- git
|
||||
- python3
|
||||
- python3-pip
|
||||
state: present
|
||||
update_cache: yes
|
||||
|
||||
- name: Check for NVIDIA GPU
|
||||
shell: lspci | grep -i nvidia
|
||||
register: nvidia_check
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
|
||||
- name: Set GPU type
|
||||
set_fact:
|
||||
gpu_type: "{{ 'nvidia' if nvidia_check.rc == 0 else 'none' }}"
|
||||
38
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
38
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
@@ -0,0 +1,38 @@
|
||||
---
|
||||
# Post-deploy integration tests
|
||||
|
||||
- name: Wait for TurboQuant service to be ready
|
||||
uri:
|
||||
url: "http://localhost:{{ llama_cpp_port }}/v1/models"
|
||||
method: GET
|
||||
status_code: 200
|
||||
register: model_check
|
||||
retries: 30
|
||||
delay: 5
|
||||
until: model_check.status == 200
|
||||
|
||||
- name: Test inference with tool call
|
||||
uri:
|
||||
url: "http://localhost:{{ llama_cpp_port }}/v1/chat/completions"
|
||||
method: POST
|
||||
body_format: json
|
||||
body:
|
||||
model: "gemma-4"
|
||||
messages:
|
||||
- role: "user"
|
||||
content: "Say 'test passed' and nothing else."
|
||||
max_tokens: 20
|
||||
status_code: 200
|
||||
register: inference_test
|
||||
|
||||
- name: Verify inference response
|
||||
assert:
|
||||
that:
|
||||
- "'choices' in inference_test.json"
|
||||
- "inference_test.json.choices | length > 0"
|
||||
fail_msg: "Inference test failed - no valid response"
|
||||
success_msg: "Inference test passed"
|
||||
|
||||
- name: Report test results
|
||||
debug:
|
||||
msg: "Integration test passed on {{ inventory_hostname }}"
|
||||
58
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
58
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
@@ -0,0 +1,58 @@
|
||||
---
|
||||
# Main tasks for TurboQuant deployment
|
||||
|
||||
- name: Gather OS facts
|
||||
setup:
|
||||
filter: ansible_distribution*
|
||||
|
||||
- name: Include OS-specific tasks
|
||||
include_tasks: "{{ ansible_os_family | lower }}.yml"
|
||||
when: ansible_os_family in ['Debian', 'RedHat', 'Darwin']
|
||||
|
||||
- name: Create model directory
|
||||
file:
|
||||
path: "{{ model_base_path }}/gemma4-turboquant"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Clone llama.cpp TurboQuant fork
|
||||
git:
|
||||
repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/llama-cpp-turboquant.git"
|
||||
dest: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
|
||||
version: "{{ turboquant_version }}"
|
||||
register: turboquant_clone
|
||||
|
||||
- name: Build llama.cpp with TurboQuant
|
||||
shell: |
|
||||
cmake -B build -DGGML_METAL={{ 'ON' if ansible_architecture == 'arm64' else 'OFF' }} -DGGML_CUDA={{ 'ON' if gpu_type == 'nvidia' else 'OFF' }} -DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build -j{{ ansible_processor_vcpus }}
|
||||
args:
|
||||
chdir: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
|
||||
when: turboquant_clone.changed or force_rebuild | default(false)
|
||||
|
||||
- name: Download Gemma 4 model
|
||||
get_url:
|
||||
url: "{{ gemma4_model_url }}"
|
||||
dest: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename }}"
|
||||
mode: '0644'
|
||||
when: gemma4_model_url is defined
|
||||
|
||||
- name: Deploy TurboQuant server config
|
||||
template:
|
||||
src: server_config.yml.j2
|
||||
dest: "{{ model_base_path }}/gemma4-turboquant/server_config.yml"
|
||||
mode: '0644'
|
||||
|
||||
- name: Deploy systemd service
|
||||
template:
|
||||
src: turboquant.service.j2
|
||||
dest: /etc/systemd/system/turboquant.service
|
||||
mode: '0644'
|
||||
notify: restart turboquant
|
||||
|
||||
- name: Enable and start TurboQuant service
|
||||
systemd:
|
||||
name: turboquant
|
||||
enabled: yes
|
||||
state: started
|
||||
daemon_reload: yes
|
||||
@@ -0,0 +1,24 @@
|
||||
# TurboQuant Server Configuration
|
||||
# Generated by Ansible for {{ inventory_hostname }}
|
||||
|
||||
server:
|
||||
host: "0.0.0.0"
|
||||
port: {{ llama_cpp_port }}
|
||||
model_path: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }}"
|
||||
|
||||
turboquant:
|
||||
enabled: true
|
||||
kv_type: "{{ turboquant_kv_type | default('turbo4') }}"
|
||||
layer_adaptive_mode: {{ turboquant_layer_mode | default(7) }}
|
||||
|
||||
context:
|
||||
max_tokens: {{ max_context_tokens | default(131072) }}
|
||||
batch_size: {{ batch_size | default(512) }}
|
||||
|
||||
generation:
|
||||
temperature: {{ temperature | default(0.7) }}
|
||||
top_p: {{ top_p | default(0.9) }}
|
||||
top_k: {{ top_k | default(40) }}
|
||||
|
||||
environment:
|
||||
TURBO_LAYER_ADAPTIVE: "{{ turboquant_layer_mode | default(7) }}"
|
||||
@@ -0,0 +1,19 @@
|
||||
[Unit]
|
||||
Description=TurboQuant Gemma 4 Inference Server
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User={{ turboquant_user | default('root') }}
|
||||
WorkingDirectory={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}
|
||||
Environment=TURBO_LAYER_ADAPTIVE={{ turboquant_layer_mode | default(7) }}
|
||||
{% if ansible_architecture == 'arm64' %}
|
||||
Environment=GGML_METAL_DEBUG=0
|
||||
Environment=OMP_NUM_THREADS={{ ansible_processor_vcpus }}
|
||||
{% endif %}
|
||||
ExecStart={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}/build/bin/llama-server -m {{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }} --port {{ llama_cpp_port }} -ctk {{ turboquant_kv_type | default('turbo4') }} -ctv {{ turboquant_kv_type | default('turbo4') }} -c {{ max_context_tokens | default(131072) }} --host 0.0.0.0
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
Reference in New Issue
Block a user