Compare commits

...

10 Commits

10 changed files with 318 additions and 0 deletions

47
ansible/README.md Normal file
View File

@@ -0,0 +1,47 @@
# TurboQuant Ansible Deployment
Deploy TurboQuant-compressed Gemma 4 inference across fleet nodes.
## Quick Start
```bash
# 1. Copy and edit inventory
cp inventory.ini.example inventory.ini
vim inventory.ini
# 2. Deploy to all nodes
ansible-playbook -i inventory.ini deploy_turboquant.yml
# 3. Deploy without integration tests
ansible-playbook -i inventory.ini deploy_turboquant.yml -e run_integration_tests=false
# 4. Deploy to specific node
ansible-playbook -i inventory.ini deploy_turboquant.yml --limit timmy
```
## Deployment Matrix
| Node | Hardware | Model | Preset |
|------|----------|-------|--------|
| Mac (Timmy) | M1, 16GB | gemma-4-26B-A4B | turboquant_k8v4 |
| Allegro VPS | 2 cores, 8GB | gemma-4-E4B | GGUF q4_0 |
## Health Check
```bash
# Check local node
./health_check.sh localhost 8081
# Check remote node
./health_check.sh 192.168.1.100 8081
```
## Role Variables
See `roles/turboquant-deploy/defaults/main.yml` for all configurable variables.
Key variables:
- `llama_cpp_port`: Server port (default: 8081)
- `turboquant_kv_type`: KV cache compression type (default: turbo4)
- `max_context_tokens`: Maximum context length (default: 131072)
- `gemma4_model_filename`: Model filename per node

View File

@@ -0,0 +1,19 @@
---
# Deploy TurboQuant-compressed Gemma 4 across fleet nodes.
# Usage: ansible-playbook -i inventory.ini ansible/deploy_turboquant.yml
#
# Extra vars:
#   run_integration_tests (default true) — pass -e run_integration_tests=false
#   to skip the post-deploy tests.
- name: Deploy TurboQuant Gemma 4
  hosts: turboquant_fleet
  become: true  # package installs, /opt, and systemd units need root
  vars:
    turboquant_version: "main"
    model_base_path: "/opt/models"
    llama_cpp_port: 8081
  roles:
    - turboquant-deploy
  post_tasks:
    - name: Run integration tests
      include_tasks: tasks/integration_test.yml
      # `| bool` is required: values from -e arrive as strings, and the
      # non-empty string "false" is truthy in a bare Jinja condition, so
      # without it the tests could never be skipped from the CLI.
      when: run_integration_tests | default(true) | bool

38
ansible/health_check.sh Normal file
View File

@@ -0,0 +1,38 @@
#!/bin/bash
# TurboQuant Health Check Script
#
# Probes a llama-server instance: checks liveness via /v1/models, lists the
# loaded models, then runs a tiny chat completion as an end-to-end test.
#
# Usage: ./health_check.sh [host] [port]
# Exit codes: 0 = healthy, 1 = server down or inference failed, 2 = missing jq.

HOST=${1:-localhost}
PORT=${2:-8081}
TIMEOUT=5
BASE_URL="http://$HOST:$PORT"

echo "=== TurboQuant Health Check ==="
echo "Host: $HOST:$PORT"

# jq is a hard dependency for response parsing; fail loudly up front instead
# of silently reporting "No models loaded" / "Inference failed" when jq is
# merely absent from the node.
if ! command -v jq > /dev/null 2>&1; then
    echo "ERROR: jq is required but not installed"
    exit 2
fi

# One request serves both the liveness check and the model listing
# (previously /v1/models was fetched twice).
if ! MODELS_JSON=$(curl -s --max-time "$TIMEOUT" "$BASE_URL/v1/models"); then
    echo "ERROR: Server not responding at $HOST:$PORT"
    exit 1
fi

MODELS=$(echo "$MODELS_JSON" | jq -r '.data[].id' 2>/dev/null)
if [ -z "$MODELS" ]; then
    echo "WARNING: No models loaded"
else
    echo "Models loaded: $MODELS"
fi

# End-to-end inference test; longer timeout because the first token after a
# cold start can be slow.
RESPONSE=$(curl -s --max-time 30 "$BASE_URL/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{"model":"gemma-4","messages":[{"role":"user","content":"Say hello"}],"max_tokens":10}')
if echo "$RESPONSE" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
    echo "Inference: OK"
    echo "Response: $(echo "$RESPONSE" | jq -r '.choices[0].message.content')"
else
    echo "ERROR: Inference failed"
    echo "Response: $RESPONSE"
    exit 1
fi
echo "=== Health Check Passed ==="

View File

@@ -0,0 +1,33 @@
# TurboQuant Fleet Inventory
# Copy to inventory.ini and update with actual hosts.
#
# Per-host overrides are set inline on the host line. `[hostname:vars]`
# sections are NOT valid for hosts in INI inventories: Ansible treats the
# section name as a *group*, creating an empty group "timmy"/"allegro" whose
# vars never reach the identically-named host. Inline host vars also take
# precedence over group vars, which is the intended override order here.

[turboquant_fleet]
# Mac (Timmy) - M1, 16GB
timmy ansible_host=192.168.1.100 ansible_user=apayne gemma4_model_filename=gemma-4-26B-A4B-q4_k_m.gguf max_context_tokens=131072

# Allegro VPS - 2 cores, 8GB
allegro ansible_host=167.99.126.228 ansible_user=root gemma4_model_filename=gemma-4-E4B-q4_0.gguf max_context_tokens=32768 turboquant_kv_type=turbo2

# Ezra VPS - TBD
# ezra ansible_host=TBD ansible_user=root

# Fleet-wide defaults (single vars section — the original repeated the
# [turboquant_fleet:vars] heading, which is confusing even though Ansible
# merges duplicates).
[turboquant_fleet:vars]
ansible_python_interpreter=/usr/bin/python3
model_base_path=/opt/models
llama_cpp_port=8081
turboquant_kv_type=turbo4
turboquant_layer_mode=7
max_context_tokens=131072

View File

@@ -0,0 +1,19 @@
---
# macOS-specific tasks
# Installs build prerequisites via Homebrew and detects Metal GPU support,
# setting the `gpu_type` fact consumed by the build step in main.yml.
# NOTE(review): the playbook runs with become enabled, but Homebrew refuses
# to run as root — confirm privilege handling on macOS hosts.
- name: Install Homebrew dependencies
  homebrew:
    name:
      - cmake
      - git
      - python@3.11
    state: present

# Query display hardware; read-only, so never reported as "changed".
- name: Check for Metal support
  command: system_profiler SPDisplaysDataType
  register: gpu_info
  changed_when: false

# gpu_type: 'apple' when the profiler output mentions Metal, else 'none'.
- name: Set GPU type
  set_fact:
    gpu_type: "{{ 'apple' if 'Metal' in gpu_info.stdout else 'none' }}"

View File

@@ -0,0 +1,23 @@
---
# Debian/Ubuntu-specific tasks
# Installs build prerequisites via apt and detects an NVIDIA GPU, setting
# the `gpu_type` fact consumed by the build step in main.yml.
- name: Install dependencies
  apt:
    name:
      - build-essential
      - cmake
      - git
      - python3
      - python3-pip
    state: present
    update_cache: yes

# Non-fatal probe: a non-zero rc (no NVIDIA device found, or lspci not
# installed) is treated as "no GPU" by the task below rather than failing
# the play. Read-only, so never reported as "changed".
- name: Check for NVIDIA GPU
  shell: lspci | grep -i nvidia
  register: nvidia_check
  failed_when: false
  changed_when: false

# gpu_type: 'nvidia' when the device was found, else 'none'; main.yml uses
# this to toggle -DGGML_CUDA at build time.
- name: Set GPU type
  set_fact:
    gpu_type: "{{ 'nvidia' if nvidia_check.rc == 0 else 'none' }}"

View File

@@ -0,0 +1,38 @@
---
# Post-deploy integration tests: wait for the server to come up, send one
# inference request, and assert that a well-formed completion came back.
# Included from the playbook's post_tasks (skippable via
# run_integration_tests=false).
- name: Wait for TurboQuant service to be ready
  uri:
    url: "http://localhost:{{ llama_cpp_port }}/v1/models"
    method: GET
    status_code: 200
  register: model_check
  # Poll for up to 30 * 5s = 150s while the server starts and loads the model.
  retries: 30
  delay: 5
  until: model_check.status == 200

# Minimal end-to-end request against the OpenAI-compatible chat endpoint.
# NOTE(review): the task name mentions a tool call but the request body
# defines no tools — confirm whether tool-call coverage was intended.
- name: Test inference with tool call
  uri:
    url: "http://localhost:{{ llama_cpp_port }}/v1/chat/completions"
    method: POST
    body_format: json
    body:
      model: "gemma-4"
      messages:
        - role: "user"
          content: "Say 'test passed' and nothing else."
      max_tokens: 20
    status_code: 200
  register: inference_test

# A valid completion must carry a non-empty `choices` array; the content
# itself is not asserted, only response shape.
- name: Verify inference response
  assert:
    that:
      - "'choices' in inference_test.json"
      - "inference_test.json.choices | length > 0"
    fail_msg: "Inference test failed - no valid response"
    success_msg: "Inference test passed"

- name: Report test results
  debug:
    msg: "Integration test passed on {{ inventory_hostname }}"

View File

@@ -0,0 +1,58 @@
---
# Main tasks for TurboQuant deployment: install build deps (OS-specific),
# build the llama.cpp TurboQuant fork, fetch the model, render configs,
# and install/start the service.
- name: Gather OS facts
  setup:
    filter: ansible_distribution*

# Dispatches to darwin.yml / debian.yml / redhat.yml by OS family.
- name: Include OS-specific tasks
  include_tasks: "{{ ansible_os_family | lower }}.yml"
  when: ansible_os_family in ['Debian', 'RedHat', 'Darwin']

- name: Create model directory
  file:
    path: "{{ model_base_path }}/gemma4-turboquant"
    state: directory
    mode: '0755'

- name: Clone llama.cpp TurboQuant fork
  git:
    repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/llama-cpp-turboquant.git"
    dest: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
    version: "{{ turboquant_version }}"
  register: turboquant_clone

# Rebuild only when the checkout changed or force_rebuild=true was passed;
# `| bool` guards against the CLI delivering force_rebuild as the truthy
# string "false".
- name: Build llama.cpp with TurboQuant
  shell: |
    cmake -B build -DGGML_METAL={{ 'ON' if ansible_architecture == 'arm64' else 'OFF' }} -DGGML_CUDA={{ 'ON' if gpu_type == 'nvidia' else 'OFF' }} -DCMAKE_BUILD_TYPE=Release
    cmake --build build -j{{ ansible_processor_vcpus }}
  args:
    chdir: "{{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}"
  when: turboquant_clone.changed or force_rebuild | default(false) | bool

- name: Download Gemma 4 model
  get_url:
    url: "{{ gemma4_model_url }}"
    dest: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename }}"
    mode: '0644'
  when: gemma4_model_url is defined

- name: Deploy TurboQuant server config
  template:
    src: server_config.yml.j2
    dest: "{{ model_base_path }}/gemma4-turboquant/server_config.yml"
    mode: '0644'

# The fleet includes a macOS node (see darwin.yml), and macOS has no
# systemd — without this guard the play fails there. A launchd equivalent
# for Darwin hosts is still TODO.
- name: Deploy systemd service
  template:
    src: turboquant.service.j2
    dest: /etc/systemd/system/turboquant.service
    mode: '0644'
  notify: restart turboquant
  when: ansible_service_mgr == 'systemd'

- name: Enable and start TurboQuant service
  systemd:
    name: turboquant
    enabled: true
    state: started
    daemon_reload: true
  when: ansible_service_mgr == 'systemd'

View File

@@ -0,0 +1,24 @@
# TurboQuant Server Configuration
# Generated by Ansible for {{ inventory_hostname }}
# Rendered from server_config.yml.j2; the Jinja defaults below mirror the
# fleet-wide values set in the inventory.
server:
  host: "0.0.0.0"
  port: {{ llama_cpp_port }}
  model_path: "{{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }}"
# KV-cache compression settings.
turboquant:
  enabled: true
  kv_type: "{{ turboquant_kv_type | default('turbo4') }}"
  layer_adaptive_mode: {{ turboquant_layer_mode | default(7) }}
context:
  max_tokens: {{ max_context_tokens | default(131072) }}
  batch_size: {{ batch_size | default(512) }}
# Default sampling parameters.
generation:
  temperature: {{ temperature | default(0.7) }}
  top_p: {{ top_p | default(0.9) }}
  top_k: {{ top_k | default(40) }}
# NOTE(review): the same layer-mode value is exported both here and as
# Environment= in the systemd unit — confirm which one the server reads.
environment:
  TURBO_LAYER_ADAPTIVE: "{{ turboquant_layer_mode | default(7) }}"

View File

@@ -0,0 +1,19 @@
# TurboQuant systemd unit, rendered from turboquant.service.j2.
# NOTE(review): the arm64 branch below presumably targets Apple Silicon, but
# macOS does not run systemd — confirm which hosts actually consume this unit.
[Unit]
Description=TurboQuant Gemma 4 Inference Server
After=network.target

[Service]
Type=simple
User={{ turboquant_user | default('root') }}
WorkingDirectory={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}
Environment=TURBO_LAYER_ADAPTIVE={{ turboquant_layer_mode | default(7) }}
{% if ansible_architecture == 'arm64' %}
Environment=GGML_METAL_DEBUG=0
Environment=OMP_NUM_THREADS={{ ansible_processor_vcpus }}
{% endif %}
# -ctk / -ctv select the KV-cache type for keys and values respectively;
# -c sets the context length in tokens.
ExecStart={{ turboquant_install_path | default('/opt/llama-cpp-turboquant') }}/build/bin/llama-server -m {{ model_base_path }}/gemma4-turboquant/{{ gemma4_model_filename | default('gemma-4-q4_k_m.gguf') }} --port {{ llama_cpp_port }} -ctk {{ turboquant_kv_type | default('turbo4') }} -ctv {{ turboquant_kv_type | default('turbo4') }} -c {{ max_context_tokens | default(131072) }} --host 0.0.0.0
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target