deploy: Ansible role for TurboQuant-compressed Gemma 4 across fleet nodes (#98)

- Adds ansible/deploy_turboquant.yml playbook with per-node config
- Adds turboquant-deploy role: OS-specific (darwin/debian) tasks
- Adds health_check.sh and integration test (chat completion)
- Adds inventory.ini.example with Mac/Allegro/Ezra groups
- Deploys llama.cpp with TurboQuant (Metal on macOS)
- Systemd service (Linux) with TURBO_LAYER_ADAPTIVE env
2026-04-26 06:55:35 -04:00
10 changed files with 405 additions and 0 deletions
--- a/ansible/README.md
+++ b/ansible/README.md
@@ -0,0 +1,19 @@
+# TurboQuant Ansible Deployment
+
+Deploy TurboQuant-compressed Gemma 4 inference across fleet nodes.
+
+## Quick Start
+
+```bash
+# Copy and edit inventory
+cp ansible/inventory.ini.example ansible/inventory.ini
+
+# Deploy to all nodes
+ansible-playbook -i ansible/inventory.ini ansible/deploy_turboquant.yml
+
+# Run health check
+ansible -i ansible/inventory.ini all -m shell -a "sudo /opt/turboquant/health_check.sh"
+
+# Run integration test
+ansible -i ansible/inventory.ini all -m shell -a "curl -s http://localhost:8081/v1/chat/completions -d '{\"model\":\"gemma-4\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello\"}]}'"
+```
--- a/ansible/deploy_turboquant.yml
+++ b/ansible/deploy_turboquant.yml
@@ -0,0 +1,78 @@
+---
+# deploy_turboquant.yml — Deploy TurboQuant across fleet nodes
+# Usage: ansible-playbook -i ansible/inventory.ini ansible/deploy_turboquant.yml
+#
+# One play per inventory group. Every play applies the turboquant-deploy role;
+# the plays differ only in the per-node vars they pass to it.
+#
+# NOTE(review): turboquant_host is 0.0.0.0 in every play, so the server listens
+# on all interfaces — confirm a firewall protects the VPS nodes.
+
+# Mac group: turbo4 KV cache, 131072-token context, layer-adaptive mode 7.
+- name: Deploy TurboQuant to Mac (local)
+  hosts: mac
+  become: true
+  gather_facts: true
+
+  vars:
+    turboquant_user: "turboquant"
+    turboquant_install_dir: "/opt/turboquant"
+    turboquant_service_name: "turboquant"
+    turboquant_port: 8081
+    turboquant_host: "0.0.0.0"
+    turboquant_context: 131072
+    turboquant_model: "gemma-4"
+    turboquant_model_file: "gemma-4-26B-A4B.gguf"
+    turboquant_kv_type: "turbo4"
+    turboquant_layer_adaptive: 7
+    node_preset: "turboquant_k8v4"
+    node_hardware: "M1-16GB"
+
+  roles:
+    - turboquant-deploy
+
+# Allegro group: q4_0 KV cache, 65536-token context, layer-adaptive mode 0.
+- name: Deploy TurboQuant to Allegro VPS
+  hosts: allegro
+  become: true
+  gather_facts: true
+
+  vars:
+    turboquant_user: "turboquant"
+    turboquant_install_dir: "/opt/turboquant"
+    turboquant_service_name: "turboquant"
+    turboquant_port: 8081
+    turboquant_host: "0.0.0.0"
+    turboquant_context: 65536
+    turboquant_model: "gemma-4-E4B"
+    turboquant_model_file: "gemma-4-E4B.gguf"
+    turboquant_kv_type: "q4_0"
+    turboquant_layer_adaptive: 0
+    node_preset: "turboquant_4bit_nc"
+    node_hardware: "VPS-2c8g"
+
+  roles:
+    - turboquant-deploy
+
+# Ezra group: same vars as the Allegro play.
+- name: Deploy TurboQuant to Ezra VPS
+  hosts: ezra
+  become: true
+  gather_facts: true
+
+  vars:
+    turboquant_user: "turboquant"
+    turboquant_install_dir: "/opt/turboquant"
+    turboquant_service_name: "turboquant"
+    turboquant_port: 8081
+    turboquant_host: "0.0.0.0"
+    turboquant_context: 65536
+    turboquant_model: "gemma-4-E4B"
+    turboquant_model_file: "gemma-4-E4B.gguf"
+    turboquant_kv_type: "q4_0"
+    turboquant_layer_adaptive: 0
+    node_preset: "turboquant_4bit_nc"
+    node_hardware: "VPS-2c8g"
+
+  roles:
+    - turboquant-deploy
--- a/ansible/health_check.sh
+++ b/ansible/health_check.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Health check for TurboQuant llama-server / vLLM deployment
+
+set -e
+
+PORT="${TURBOQUANT_PORT:-8081}"
+ENDPOINT="${TURBOQUANT_ENDPOINT:-http://localhost:${PORT}/v1/models}"
+
+echo "Checking TurboQuant server health at ${ENDPOINT}..."
+
+if command -v curl &> /dev/null; then
+    response=$(curl -s -o /dev/null -w "%{http_code}" "${ENDPOINT}" --max-time 10)
+    if [ "${response}" = "200" ]; then
+        echo "✅ Server healthy — HTTP ${response}"
+        exit 0
+    else
+        echo "❌ Server unhealthy — HTTP ${response}"
+        exit 1
+    fi
+else
+    echo "curl not found; cannot perform health check"
+    exit 2
+fi
--- a/ansible/inventory.ini.example
+++ b/ansible/inventory.ini.example
@@ -0,0 +1,25 @@
+# Ansible inventory for TurboQuant fleet deployment
+# Edit this file and save as ansible/inventory.ini before running
+
+[mac]
+# Local MacBook — runs llama-server with Metal + TurboQuant
+# ansible_connection=local runs tasks on the control machine itself (no SSH).
+timmy-mac ansible_host=localhost ansible_connection=local
+
+[allegro]
+# Allegro VPS — Debian, runs llama-server or vLLM with GGUF q4_0
+allegro-primary ansible_host=167.99.126.228 ansible_user=root
+
+[ezra]
+# Ezra VPS — Ubuntu, runs llama-server or vLLM
+# Reached over SSH like allegro; do not set ansible_connection=local on a
+# remote host or its tasks would run on the control machine instead.
+ezra-primary ansible_host=143.198.27.163 ansible_user=root
+
+[turbonodes:children]
+mac
+allegro
+ezra
+
+[turbonodes:vars]
+ansible_python_interpreter=/usr/bin/python3
--- a/ansible/roles/turboquant-deploy/tasks/darwin.yml
+++ b/ansible/roles/turboquant-deploy/tasks/darwin.yml
@@ -0,0 +1,83 @@
+---
+# macOS deployment — builds llama.cpp with Metal + TurboQuant
+#
+# Every task is guarded by ansible_os_family == "Darwin".
+# NOTE(review): no task in this file creates turboquant_user or places the
+# model file, yet the start instructions at the bottom reference both —
+# confirm they are provisioned elsewhere.
+
+- name: Ensure Xcode command line tools are installed
+  command: xcode-select -p
+  register: xcode_check
+  changed_when: false
+  failed_when: false
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, darwin]
+
+# Plain command: no shell features are needed for this invocation.
+- name: Install Xcode CLI tools if missing (macOS)
+  command: xcode-select --install
+  when: ansible_os_family == "Darwin" and xcode_check.rc != 0
+  tags: [turboquant, darwin]
+
+# Read-only probe: fails the play when git is absent, never reports a change.
+- name: Check for Git
+  command: which git
+  register: git_check
+  changed_when: false
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, deps]
+
+# force: true discards local modifications in the checkout on every run.
+- name: Clone llama.cpp TurboQuant fork
+  git:
+    repo: "https://github.com/TheTom/llama-cpp-turboquant.git"
+    dest: "{{ turboquant_install_dir }}/llama.cpp"
+    version: "feature/turboquant-kv-cache"
+    force: true
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, source]
+
+# creates: skips the build once llama-server exists, so a later source update
+# is not rebuilt until the binary is removed.
+# set -e makes a failed configure step fail the task instead of being masked.
+- name: Build llama.cpp with Metal + TurboQuant
+  shell: |
+    set -e
+    cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_METAL=ON
+    cmake --build build -j$(sysctl -n hw.ncpu)
+  args:
+    chdir: "{{ turboquant_install_dir }}/llama.cpp"
+    creates: "{{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server"
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, build]
+
+- name: Create models directory
+  file:
+    path: "{{ turboquant_install_dir }}/models"
+    state: directory
+    mode: '0755'
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, deploy]
+
+# src is a relative path; presumably it resolves to ansible/health_check.sh
+# through the role search path — TODO confirm.
+- name: Deploy health check script
+  copy:
+    src: "../../health_check.sh"
+    dest: "{{ turboquant_install_dir }}/health_check.sh"
+    mode: '0755'
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, deploy]
+
+- name: Print macOS manual start instructions
+  debug:
+    msg: |
+      Mac deployment complete. To start the server manually:
+        export TURBO_LAYER_ADAPTIVE={{ turboquant_layer_adaptive }}
+        sudo -u {{ turboquant_user }} {{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server \
+          -m {{ turboquant_install_dir }}/models/{{ turboquant_model_file }} \
+          --host {{ turboquant_host }} --port {{ turboquant_port }} \
+          -c {{ turboquant_context }} -ctk {{ turboquant_kv_type }} -ctv {{ turboquant_kv_type }}
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, deploy]
--- a/ansible/roles/turboquant-deploy/tasks/debian.yml
+++ b/ansible/roles/turboquant-deploy/tasks/debian.yml
@@ -0,0 +1,103 @@
+---
+# Debian/Ubuntu deployment — installs llama.cpp with TurboQuant, uses systemd
+#
+# NOTE(review): no task in this file places the model file under models/,
+# yet the service unit is started here — confirm it is provisioned elsewhere.
+
+- name: Update apt cache
+  apt:
+    update_cache: true
+    cache_valid_time: 3600
+  tags: [turboquant, deps]
+
+- name: Install build dependencies
+  apt:
+    name:
+      - build-essential
+      - cmake
+      - git
+      - curl
+      - python3
+      - python3-pip
+      - python3-venv
+    state: present
+  tags: [turboquant, deps]
+
+# System account with a nologin shell and no home directory.
+- name: Create turboquant user
+  user:
+    name: "{{ turboquant_user }}"
+    system: true
+    shell: /usr/sbin/nologin
+    create_home: false
+  tags: [turboquant, prereq]
+
+- name: Create install directory
+  file:
+    path: "{{ turboquant_install_dir }}"
+    state: directory
+    mode: '0755'
+    owner: "{{ turboquant_user }}"
+    group: "{{ turboquant_user }}"
+  tags: [turboquant, prereq]
+
+# force: true discards local modifications in the checkout on every run.
+- name: Clone llama.cpp TurboQuant fork
+  git:
+    repo: "https://github.com/TheTom/llama-cpp-turboquant.git"
+    dest: "{{ turboquant_install_dir }}/llama.cpp"
+    version: "feature/turboquant-kv-cache"
+    force: true
+  tags: [turboquant, source]
+
+# creates: skips the build once llama-server exists.
+# set -e makes a failed configure step fail the task instead of being masked.
+- name: Build llama.cpp with TurboQuant
+  shell: |
+    set -e
+    cmake -B build -DCMAKE_BUILD_TYPE=Release
+    cmake --build build -j$(nproc)
+  args:
+    chdir: "{{ turboquant_install_dir }}/llama.cpp"
+    creates: "{{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server"
+  tags: [turboquant, build]
+
+- name: Create models directory
+  file:
+    path: "{{ turboquant_install_dir }}/models"
+    state: directory
+    mode: '0755'
+    owner: "{{ turboquant_user }}"
+    group: "{{ turboquant_user }}"
+  tags: [turboquant, deploy]
+
+- name: Deploy systemd service unit
+  template:
+    src: turboquant.service.j2
+    dest: "/etc/systemd/system/{{ turboquant_service_name }}.service"
+    mode: '0644'
+  register: turboquant_unit
+  tags: [turboquant, service]
+
+- name: Reload systemd daemon
+  systemd:
+    daemon_reload: true
+  tags: [turboquant, service]
+
+# state: started never restarts a running service, so a changed unit file
+# would not take effect; restart explicitly when the template task changed it.
+- name: Enable and start TurboQuant service
+  systemd:
+    name: "{{ turboquant_service_name }}"
+    state: "{{ 'restarted' if turboquant_unit is changed else 'started' }}"
+    enabled: true
+  tags: [turboquant, service]
+
+- name: Deploy health check script
+  copy:
+    src: "../../health_check.sh"
+    dest: "{{ turboquant_install_dir }}/health_check.sh"
+    mode: '0755'
+    owner: "{{ turboquant_user }}"
+    group: "{{ turboquant_user }}"
+  tags: [turboquant, deploy]
--- a/ansible/roles/turboquant-deploy/tasks/integration_test.yml
+++ b/ansible/roles/turboquant-deploy/tasks/integration_test.yml
@@ -0,0 +1,51 @@
+---
+# Integration test — verify server responds to a simple query
+#
+# Every task is skipped on Darwin, where the server is started manually.
+
+- name: Wait for service to be ready (HTTP 200 on /v1/models)
+  uri:
+    url: "http://localhost:{{ turboquant_port }}/v1/models"
+    method: GET
+    status_code: 200
+  register: svc_ready
+  retries: 12
+  delay: 5
+  until: svc_ready.status == 200
+  when: ansible_os_family != "Darwin"  # skip on mac for now; service starts manually
+  tags: [turboquant, healthcheck]
+
+- name: Run integration test — simple query
+  uri:
+    url: "http://localhost:{{ turboquant_port }}/v1/chat/completions"
+    method: POST
+    body_format: json
+    body:
+      model: "{{ turboquant_model }}"
+      messages:
+        - role: "user"
+          content: "Test: 2+2 equals what? Answer with only the number."
+      max_tokens: 5
+      temperature: 0.0
+    return_content: true
+  register: completion
+  when: ansible_os_family != "Darwin"
+  tags: [turboquant, test]
+
+# Inspect only the assistant message. The raw body of an OpenAI-style response
+# also carries fields such as the model name (e.g. "gemma-4-E4B"), so searching
+# the whole body for '4' would pass whatever the model answered.
+- name: Verify response contains expected answer
+  assert:
+    that:
+      - completion.status == 200
+      - completion.json is defined
+      - "'4' in (completion.json.choices[0].message.content | default(''))"
+  when: ansible_os_family != "Darwin"
+  tags: [turboquant, test]
+
+- name: Log integration result
+  debug:
+    msg: "Integration test passed — TurboQuant server responded correctly"
+  when: ansible_os_family != "Darwin"
+  tags: [turboquant, test]
--- a/ansible/roles/turboquant-deploy/tasks/main.yml
+++ b/ansible/roles/turboquant-deploy/tasks/main.yml
@@ -0,0 +1,21 @@
+---
+# Main entry point — common setup followed by OS-specific tasks
+
+- name: Ensure install directory exists (common)
+  file:
+    path: "{{ turboquant_install_dir }}"
+    state: directory
+    mode: '0755'
+  tags: [turboquant, prereq]
+
+# Tags on include_tasks apply to the include itself, not to the included
+# tasks. "always" lets runs such as --tags build reach the tagged tasks
+# inside the file; those tasks are still filtered by their own tags.
+- name: Include OS-specific tasks
+  include_tasks: "{{ ansible_os_family | lower }}.yml"
+  tags: [turboquant, deploy, always]
+
+# "always" here for the same reason; --skip-tags test still skips the include.
+- name: Run post-deploy integration tests
+  include_tasks: integration_test.yml
+  tags: [turboquant, test, always]
--- a/ansible/roles/turboquant-deploy/templates/server_config.yml.j2
+++ b/ansible/roles/turboquant-deploy/templates/server_config.yml.j2
@@ -0,0 +1,25 @@
+---
+# TurboQuant Server Configuration
+# Auto-generated by Ansible — node: {{ ansible_host | default('localhost') }}
+
+server:
+  host: "{{ turboquant_host }}"
+  port: {{ turboquant_port }}
+  model: "{{ turboquant_model }}"
+  model_file: "{{ turboquant_model_file }}"
+  base_url: "http://localhost:{{ turboquant_port }}/v1"  # always localhost, independent of host
+
+turboquant:
+  enabled: true
+  preset: "{{ node_preset }}"
+  kv_type: "{{ turboquant_kv_type }}"
+  layer_adaptive_mode: {{ turboquant_layer_adaptive }}
+
+performance:
+  max_context: {{ turboquant_context }}
+  threads: {{ ansible_processor_vcpus | default(2) }}  # 2 when the vcpus fact is undefined
+
+deployment:
+  install_dir: "{{ turboquant_install_dir }}"
+  service_name: "{{ turboquant_service_name }}"
+  node_hardware: "{{ node_hardware }}"
--- a/ansible/roles/turboquant-deploy/templates/turboquant.service.j2
+++ b/ansible/roles/turboquant-deploy/templates/turboquant.service.j2
@@ -0,0 +1,29 @@
+# Systemd unit for the TurboQuant llama-server, rendered by Ansible.
+[Unit]
+Description=TurboQuant {{ turboquant_model }} Inference Server
+After=network.target
+
+[Service]
+Type=simple
+User={{ turboquant_user }}
+Group={{ turboquant_user }}
+WorkingDirectory={{ turboquant_install_dir }}
+Environment="TURBO_LAYER_ADAPTIVE={{ turboquant_layer_adaptive }}"
+# The binary lives in the llama.cpp CMake build tree (build/bin), not directly
+# under the install directory.
+ExecStart={{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server \
+    -m {{ turboquant_install_dir }}/models/{{ turboquant_model_file }} \
+    --host {{ turboquant_host }} \
+    --port {{ turboquant_port }} \
+    -c {{ turboquant_context }} \
+    -ctk {{ turboquant_kv_type }} -ctv {{ turboquant_kv_type }} \
+    --threads {{ ansible_processor_vcpus | default(2) }}
+
+# Restart the server 5 seconds after any exit.
+Restart=always
+RestartSec=5
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=multi-user.target