deploy: Ansible role for TurboQuant-compressed Gemma 4 across fleet nodes (#98)

- Adds ansible/deploy_turboquant.yml playbook with per-node config
- Adds turboquant-deploy role: OS-specific (darwin/debian) tasks
- Adds health_check.sh and integration test (chat completion)
- Adds inventory.ini.example with Mac/Allegro/Ezra groups
- Deploys llama.cpp with TurboQuant (Metal on macOS)
- Systemd service (Linux) with TURBO_LAYER_ADAPTIVE env
2026-04-26 06:55:35 -04:00
14 changed files with 405 additions and 690 deletions
--- a/ansible/README.md
+++ b/ansible/README.md
@@ -0,0 +1,19 @@
+# TurboQuant Ansible Deployment
+
+Deploy TurboQuant-compressed Gemma 4 inference across fleet nodes.
+
+## Quick Start
+
+```bash
+# Copy and edit inventory
+cp ansible/inventory.ini.example ansible/inventory.ini
+
+# Deploy to all nodes
+ansible-playbook -i ansible/inventory.ini ansible/deploy_turboquant.yml
+
+# Run health check
+ansible -i ansible/inventory.ini all -m shell -a "sudo /opt/turboquant/health_check.sh"
+
+# Run integration test
+ansible -i ansible/inventory.ini all -m shell -a "curl -s http://localhost:8081/v1/chat/completions -d '{\"model\":\"gemma-4\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello\"}]}'"
+```
--- a/ansible/deploy_turboquant.yml
+++ b/ansible/deploy_turboquant.yml
@@ -0,0 +1,69 @@
+---
+# deploy_turboquant.yml — one play per inventory group (mac, allegro, ezra), each applying the turboquant-deploy role
+# Usage: ansible-playbook -i ansible/inventory.ini ansible/deploy_turboquant.yml
+
+- name: Deploy TurboQuant to Mac (local)
+  hosts: mac
+  become: true
+  gather_facts: true
+
+  vars:
+    turboquant_user: "turboquant"
+    turboquant_install_dir: "/opt/turboquant"
+    turboquant_service_name: "turboquant"
+    turboquant_port: 8081
+    turboquant_host: "0.0.0.0"
+    turboquant_context: 131072
+    turboquant_model: "gemma-4"
+    turboquant_model_file: "gemma-4-26B-A4B.gguf"
+    turboquant_kv_type: "turbo4"
+    turboquant_layer_adaptive: 7
+    node_preset: "turboquant_k8v4"
+    node_hardware: "M1-16GB"
+
+  roles:
+    - turboquant-deploy
+
+- name: Deploy TurboQuant to Allegro VPS
+  hosts: allegro
+  become: true
+  gather_facts: true
+
+  vars:
+    turboquant_user: "turboquant"
+    turboquant_install_dir: "/opt/turboquant"
+    turboquant_service_name: "turboquant"
+    turboquant_port: 8081
+    turboquant_host: "0.0.0.0"
+    turboquant_context: 65536
+    turboquant_model: "gemma-4-E4B"
+    turboquant_model_file: "gemma-4-E4B.gguf"
+    turboquant_kv_type: "q4_0"
+    turboquant_layer_adaptive: 0
+    node_preset: "turboquant_4bit_nc"
+    node_hardware: "VPS-2c8g"
+
+  roles:
+    - turboquant-deploy
+
+- name: Deploy TurboQuant to Ezra VPS
+  hosts: ezra
+  become: true
+  gather_facts: true
+
+  vars:
+    turboquant_user: "turboquant"
+    turboquant_install_dir: "/opt/turboquant"
+    turboquant_service_name: "turboquant"
+    turboquant_port: 8081
+    turboquant_host: "0.0.0.0"
+    turboquant_context: 65536
+    turboquant_model: "gemma-4-E4B"
+    turboquant_model_file: "gemma-4-E4B.gguf"
+    turboquant_kv_type: "q4_0"
+    turboquant_layer_adaptive: 0
+    node_preset: "turboquant_4bit_nc"
+    node_hardware: "VPS-2c8g"
+
+  roles:
+    - turboquant-deploy
--- a/ansible/health_check.sh
+++ b/ansible/health_check.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Health check for TurboQuant llama-server / vLLM deployment
+
+set -e
+
+PORT="${TURBOQUANT_PORT:-8081}"
+ENDPOINT="${TURBOQUANT_ENDPOINT:-http://localhost:${PORT}/v1/models}"
+
+echo "Checking TurboQuant server health at ${ENDPOINT}..."
+
+if command -v curl &> /dev/null; then
+    response=$(curl -s -o /dev/null -w "%{http_code}" "${ENDPOINT}" --max-time 10)
+    if [ "${response}" = "200" ]; then
+        echo "✅ Server healthy — HTTP ${response}"
+        exit 0
+    else
+        echo "❌ Server unhealthy — HTTP ${response}"
+        exit 1
+    fi
+else
+    echo "curl not found; cannot perform health check"
+    exit 2
+fi
--- a/ansible/inventory.ini.example
+++ b/ansible/inventory.ini.example
@@ -0,0 +1,22 @@
+# Ansible inventory for TurboQuant fleet deployment
+# Edit this file and save as ansible/inventory.ini before running
+
+[mac]
+# Local MacBook — runs llama-server with Metal + TurboQuant (only this host uses ansible_connection=local)
+timmy-mac ansible_host=localhost ansible_connection=local
+
+[allegro]
+# Allegro VPS — Debian, runs llama-server or vLLM with GGUF q4_0
+allegro-primary ansible_host=167.99.126.228 ansible_user=root
+
+[ezra]
+# Ezra VPS — Ubuntu, runs llama-server or vLLM (remote host: must be reached over SSH, not connection=local)
+ezra-primary ansible_host=143.198.27.163 ansible_user=root
+
+[turbonodes:children]
+mac
+allegro
+ezra
+
+[turbonodes:vars]
+ansible_python_interpreter=/usr/bin/python3
--- a/ansible/roles/turboquant-deploy/tasks/darwin.yml
+++ b/ansible/roles/turboquant-deploy/tasks/darwin.yml
@@ -0,0 +1,68 @@
+---
+# macOS tasks — verify toolchain, clone the TurboQuant llama.cpp fork, build it with Metal, stage models dir + health check
+
+- name: Ensure Xcode command line tools are installed
+  command: xcode-select -p
+  register: xcode_check
+  changed_when: false
+  failed_when: false
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, darwin]
+
+- name: Install Xcode CLI tools if missing (macOS)
+  command: xcode-select --install
+  when: ansible_os_family == "Darwin" and xcode_check.rc != 0
+  tags: [turboquant, darwin]
+
+- name: Check for Git
+  command: which git
+  register: git_check
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, deps]
+
+- name: Clone llama.cpp TurboQuant fork
+  git:
+    repo: "https://github.com/TheTom/llama-cpp-turboquant.git"
+    dest: "{{ turboquant_install_dir }}/llama.cpp"
+    version: "feature/turboquant-kv-cache"
+    force: true
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, source]
+
+- name: Build llama.cpp with Metal + TurboQuant
+  shell: |
+    cd {{ turboquant_install_dir }}/llama.cpp
+    cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_METAL=ON
+    cmake --build build -j$(sysctl -n hw.ncpu)
+  args:
+    creates: "{{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server"
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, build]
+
+- name: Create models directory
+  file:
+    path: "{{ turboquant_install_dir }}/models"
+    state: directory
+    mode: "0755"
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, deploy]
+
+- name: Deploy health check script
+  copy:
+    src: "../../health_check.sh"
+    dest: "{{ turboquant_install_dir }}/health_check.sh"
+    mode: "0755"
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, deploy]
+
+- name: Print macOS manual start instructions
+  debug:
+    msg: |
+      Mac deployment complete. To start the server manually:
+        TURBO_LAYER_ADAPTIVE={{ turboquant_layer_adaptive }} \
+        {{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server \
+          -m {{ turboquant_install_dir }}/models/{{ turboquant_model_file }} \
+          --host {{ turboquant_host }} --port {{ turboquant_port }} \
+          -c {{ turboquant_context }} -ctk {{ turboquant_kv_type }} -ctv {{ turboquant_kv_type }}
+  when: ansible_os_family == "Darwin"
+  tags: [turboquant, deploy]
--- a/ansible/roles/turboquant-deploy/tasks/debian.yml
+++ b/ansible/roles/turboquant-deploy/tasks/debian.yml
@@ -0,0 +1,92 @@
+---
+# Debian/Ubuntu tasks — install build deps, create the service user, build the TurboQuant llama.cpp fork, run it under systemd
+
+- name: Update apt cache
+  apt:
+    update_cache: true
+    cache_valid_time: 3600
+  tags: [turboquant, deps]
+
+- name: Install build dependencies
+  apt:
+    name:
+      - build-essential
+      - cmake
+      - git
+      - curl
+      - python3
+      - python3-pip
+      - python3-venv
+    state: present
+  tags: [turboquant, deps]
+
+- name: Create turboquant user
+  user:
+    name: "{{ turboquant_user }}"
+    system: true
+    shell: /usr/sbin/nologin
+    create_home: false
+  tags: [turboquant, prereq]
+
+- name: Create install directory
+  file:
+    path: "{{ turboquant_install_dir }}"
+    state: directory
+    mode: "0755"
+    owner: "{{ turboquant_user }}"
+    group: "{{ turboquant_user }}"
+  tags: [turboquant, prereq]
+
+- name: Clone llama.cpp TurboQuant fork
+  git:
+    repo: "https://github.com/TheTom/llama-cpp-turboquant.git"
+    dest: "{{ turboquant_install_dir }}/llama.cpp"
+    version: "feature/turboquant-kv-cache"
+    force: true
+  tags: [turboquant, source]
+
+- name: Build llama.cpp with TurboQuant
+  shell: |
+    cd {{ turboquant_install_dir }}/llama.cpp
+    cmake -B build -DCMAKE_BUILD_TYPE=Release
+    cmake --build build -j$(nproc)
+  args:
+    creates: "{{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server"
+  tags: [turboquant, build]
+
+- name: Create models directory
+  file:
+    path: "{{ turboquant_install_dir }}/models"
+    state: directory
+    mode: "0755"
+    owner: "{{ turboquant_user }}"
+    group: "{{ turboquant_user }}"
+  tags: [turboquant, deploy]
+
+- name: Deploy systemd service unit
+  template:
+    src: turboquant.service.j2
+    dest: /etc/systemd/system/{{ turboquant_service_name }}.service
+    mode: "0644"
+  tags: [turboquant, service]
+
+- name: Reload systemd daemon
+  systemd:
+    daemon_reload: true
+  tags: [turboquant, service]
+
+- name: Enable and start TurboQuant service
+  systemd:
+    name: "{{ turboquant_service_name }}"
+    state: started
+    enabled: true
+  tags: [turboquant, service]
+
+- name: Deploy health check script
+  copy:
+    src: "../../health_check.sh"
+    dest: "{{ turboquant_install_dir }}/health_check.sh"
+    mode: "0755"
+    owner: "{{ turboquant_user }}"
+    group: "{{ turboquant_user }}"
+  tags: [turboquant, deploy]
--- a/ansible/roles/turboquant-deploy/tasks/integration_test.yml
+++ b/ansible/roles/turboquant-deploy/tasks/integration_test.yml
@@ -0,0 +1,45 @@
+---
+# Integration test — wait for /v1/models, send one chat completion, assert on the generated message text (non-Darwin hosts only)
+
+- name: Wait for service to be ready (HTTP 200 on /v1/models)
+  uri:
+    url: "http://localhost:{{ turboquant_port }}/v1/models"
+    method: GET
+    status_code: 200
+  register: svc_ready
+  retries: 12
+  delay: 5
+  until: svc_ready.status == 200
+  when: ansible_os_family != "Darwin"  # skip on mac for now; service starts manually
+  tags: [turboquant, healthcheck]
+
+- name: Run integration test — simple query
+  uri:
+    url: "http://localhost:{{ turboquant_port }}/v1/chat/completions"
+    method: POST
+    body_format: json
+    body:
+      model: "{{ turboquant_model }}"
+      messages:
+        - role: "user"
+          content: "Test: 2+2 equals what? Answer with only the number."
+      max_tokens: 5
+      temperature: 0.0
+    return_content: true
+  register: completion
+  when: ansible_os_family != "Darwin"
+  tags: [turboquant, test]
+
+- name: Verify response contains expected answer
+  assert:
+    that:
+      - completion.status == 200
+      - "'4' in (completion.json.choices[0].message.content | default(''))"  # message text only; the raw body echoes the model name, which itself contains '4'
+  when: ansible_os_family != "Darwin"
+  tags: [turboquant, test]
+
+- name: Log integration result
+  debug:
+    msg: "Integration test passed — TurboQuant server responded correctly"
+  when: ansible_os_family != "Darwin"
+  tags: [turboquant, test]
--- a/ansible/roles/turboquant-deploy/tasks/main.yml
+++ b/ansible/roles/turboquant-deploy/tasks/main.yml
@@ -0,0 +1,17 @@
+---
+# Role entry point — ensures the install dir exists, includes "<ansible_os_family | lower>.yml", then includes integration_test.yml
+
+- name: Ensure install directory exists (common)
+  file:
+    path: "{{ turboquant_install_dir }}"
+    state: directory
+    mode: '0755'
+  tags: [turboquant, prereq]
+
+- name: Include OS-specific tasks
+  include_tasks: "{{ ansible_os_family | lower }}.yml"
+  tags: [turboquant, deploy]
+
+- name: Run post-deploy integration tests
+  include_tasks: integration_test.yml
+  tags: [turboquant, test]
--- a/ansible/roles/turboquant-deploy/templates/server_config.yml.j2
+++ b/ansible/roles/turboquant-deploy/templates/server_config.yml.j2
@@ -0,0 +1,25 @@
+---
+# TurboQuant Server Configuration
+# Auto-generated by Ansible — node: {{ ansible_host | default('localhost') }}
+
+server:
+  host: "{{ turboquant_host }}"
+  port: {{ turboquant_port }}
+  model: "{{ turboquant_model }}"
+  model_file: "{{ turboquant_model_file }}"
+  base_url: "http://localhost:{{ turboquant_port }}/v1"
+
+turboquant:
+  enabled: true
+  preset: "{{ node_preset }}"
+  kv_type: "{{ turboquant_kv_type }}"
+  layer_adaptive_mode: {{ turboquant_layer_adaptive }}
+
+performance:
+  max_context: {{ turboquant_context }}
+  threads: {{ ansible_processor_vcpus | default(2) }}
+
+deployment:
+  install_dir: "{{ turboquant_install_dir }}"
+  service_name: "{{ turboquant_service_name }}"
+  node_hardware: "{{ node_hardware }}"
--- a/ansible/roles/turboquant-deploy/templates/turboquant.service.j2
+++ b/ansible/roles/turboquant-deploy/templates/turboquant.service.j2
@@ -0,0 +1,25 @@
+[Unit]
+Description=TurboQuant {{ turboquant_model }} Inference Server
+After=network.target
+
+[Service]
+Type=simple
+User={{ turboquant_user }}
+Group={{ turboquant_user }}
+WorkingDirectory={{ turboquant_install_dir }}
+Environment="TURBO_LAYER_ADAPTIVE={{ turboquant_layer_adaptive }}"
+ExecStart={{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server \
+    -m {{ turboquant_install_dir }}/models/{{ turboquant_model_file }} \
+    --host {{ turboquant_host }} \
+    --port {{ turboquant_port }} \
+    -c {{ turboquant_context }} \
+    -ctk {{ turboquant_kv_type }} -ctv {{ turboquant_kv_type }} \
+    --threads {{ ansible_processor_vcpus | default(2) }}
+# ExecStart uses the in-tree cmake output (llama.cpp/build/bin); the role does not copy the binary elsewhere.
+Restart=always
+RestartSec=5
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=multi-user.target
--- a/benchmarks/allegro-2026-04-14.md
+++ b/benchmarks/allegro-2026-04-14.md
@@ -1,56 +0,0 @@
-# Allegro VPS Benchmark Analysis — TurboQuant Presets
-
-*Generated: 2026-04-26*
-
-> **Hardware:** Allegro VPS — 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 LTS  
-> **Server:** `llama-server` with TurboQuant KV compression (CPU backend)  
-> **Scope:** Compare TurboQuant preset configurations for memory vs. throughput trade-offs
-
-## Preset Summary
-
-| Preset | Model | KV Type | Est. RAM (GB) | Fits 6GB? | Target |
-|--------|-------|---------|---------------|-----------|--------|
-| tiny   | 2B Q4 | f16     | 2.8           | ✅        | Baseline |
-| small  | 3B Q4 | turbo2  | 3.6           | ✅        | Best throughput |
-| medium | 7B Q4 | turbo4  | 5.2           | ✅        | **Recommended** (quality within budget) |
-| medium-long | 7B Q4 | turbo4 (q3_k) | 5.8    | ✅        | Extended context |
-| large  | 14B Q3 | turbo4 | 7.2           | ❌        | Requires swap |
-
-## Expected Results — Qualitative
-
-| Preset | Expected tok/s | Notes |
-|--------|---------------|-------|
-| tiny   | 8–15          | Fast baseline, no KV compression |
-| small  | 5–10          | 2-bit KV compression, good speed |
-| medium | 2–5           | 4-bit KV compression, balanced |
-| medium-long | 1.5–4    | Better model quant, longer context |
-| large  | 0.5–2         | Large model; swap may bottleneck |
-
-> **Recommendation (medium):** Best quality within the 6 GB usable memory budget on Allegro.  
-> 7B Q4 with turbo4 KV gives ~5.2 GB total; 14B requires swap (issue #115).
-
-## Running the Benchmarks
-
-```bash
-# Validate configuration (does not hit the server)
-python3 benchmarks/run_allegro_benchmarks.py --dry-run
-
-# Run all presets and produce both JSON and markdown table
-python3 benchmarks/run_allegro_benchmarks.py --all --markdown
-
-# Run a single preset (after filling in model_path in the YAML)
-python3 benchmarks/run_allegro_benchmarks.py --preset medium
-```
-
-## Deliverables
-
- ✅ `profiles/allegro-cpu-presets.yaml` — preset configurations
- ✅ `benchmarks/run_allegro_benchmarks.py` — runner script
- ✅ `benchmarks/allegro-2026-04-14.md` — this analysis (expected results)
- ✅ `tests/test_allegro_benchmarks.py` — smoke tests for preset loading/validation
-
-## Next Steps
-
-1. Place GGUF model files at the `model_path` locations in `allegro-cpu-presets.yaml`.
-2. Ensure llama-server with TurboQuant is running on port 8081.
-3. Run `--all --markdown` and commit the generated `allegro-<timestamp>.md` results.
--- a/benchmarks/run_allegro_benchmarks.py
+++ b/benchmarks/run_allegro_benchmarks.py
@@ -1,348 +0,0 @@
-#!/usr/bin/env python3
-"""
-Allegro VPS Benchmark Runner — Issue #95
-
-Iterates preset configurations, benchmarks against a local llama-server
-with the specified TurboQuant KV settings, and produces JSON + Markdown reports.
-
-Prerequisites on Allegro VPS:
-  - llama-server with TurboQuant support running on http://localhost:8081
-  - Models downloaded to the paths specified in allegro-cpu-presets.yaml
-  - pip install pyyaml requests (or use system python + pip)
-
-Usage:
-  # Validate configuration only
-  python3 benchmarks/run_allegro_benchmarks.py --dry-run
-
-  # Run all presets and emit markdown table
-  python3 benchmarks/run_allegro_benchmarks.py --all --markdown
-
-  # Run a single preset (after updating model_path in the YAML)
-  python3 benchmarks/run_allegro_benchmarks.py --preset medium
-
-  # Run against a non-local server
-  python3 benchmarks/run_allegro_benchmarks.py --url http://192.168.1.100:8081 --all
-"""
-
-import argparse
-import json
-import os
-import sys
-import time
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Dict, List, Optional
-
-import requests
-
-# ─── Paths ────────────────────────────────────────────────────────────────────
-REPO_ROOT = Path(__file__).resolve().parents[1]
-PROFILE_PATH = REPO_ROOT / "profiles" / "allegro-cpu-presets.yaml"
-PROMPTS_PATH = REPO_ROOT / "benchmarks" / "prompts.json"
-RESULTS_DIR = REPO_ROOT / "benchmarks" / "results"
-RESULTS_DIR.mkdir(parents=True, exist_ok=True)
-
-
-# ─── Preset loader ────────────────────────────────────────────────────────────
-def load_presets() -> List[Dict]:
-    """Load preset list from allegro-cpu-presets.yaml."""
-    try:
-        import yaml
-    except ImportError:
-        print("ERROR: PyYAML required. Install: pip install pyyaml", file=sys.stderr)
-        sys.exit(1)
-
-    with open(PROFILE_PATH) as f:
-        data = yaml.safe_load(f)
-
-    presets = data.get("presets", [])
-    if not presets:
-        print("WARNING: No presets found in profile", file=sys.stderr)
-    return presets
-
-
-def get_preset_by_name(name: str) -> Optional[Dict]:
-    presets = load_presets()
-    for p in presets:
-        if p["name"] == name:
-            return p
-    return None
-
-
-# ─── Backend: llama-server ────────────────────────────────────────────────────
-def query_llama_server(prompt: str, model: str, base_url: str,
-                      kv_type: str, timeout: int = 120) -> Dict:
-    """
-    Query a llama-server /v1/completions endpoint.
-
-    Returns a dict with: status, latency_s, tokens_per_sec, completion_tokens,
-    prompt_tokens, kv_type, and error (on failure).
-    """
-    api_url = f"{base_url.rstrip('/')}/v1/completions"
-    start = time.time()
-
-    try:
-        resp = requests.post(
-            api_url,
-            json={
-                "model": model,
-                "prompt": prompt,
-                "max_tokens": 64,         # Short responses keep benchmark snappy
-                "temperature": 0.7,
-                "stream": False,
-            },
-            timeout=timeout,
-        )
-        resp.raise_for_status()
-        data = resp.json()
-
-        usage = data.get("usage", {})
-        completion_tokens = usage.get("completion_tokens", 0)
-        prompt_tokens = usage.get("prompt_tokens", 0)
-
-        elapsed = time.time() - start
-        # Estimate tokens/sec (subtract 0.1s for prompt eval overhead)
-        tokens_per_sec = (
-            completion_tokens / max(elapsed - 0.1, 0.01)
-            if completion_tokens > 0 else 0.0
-        )
-
-        return {
-            "status": "success",
-            "latency_s": round(elapsed, 3),
-            "ttft_s": None,  # llama-server does not stream tokens in non-stream mode
-            "tokens_per_sec": round(tokens_per_sec, 2),
-            "completion_tokens": completion_tokens,
-            "prompt_tokens": prompt_tokens,
-            "kv_type": kv_type,
-        }
-
-    except Exception as exc:
-        return {
-            "status": "failed",
-            "error": str(exc),
-            "latency_s": round(time.time() - start, 3),
-            "tokens_per_sec": 0.0,
-            "kv_type": kv_type,
-        }
-
-
-# ─── Benchmark logic ──────────────────────────────────────────────────────────
-def run_preset_benchmark(preset: Dict, base_url: str,
-                         prompts: List[str], timeout: int = 120) -> Dict:
-    """
-    Run all prompts for a single preset and return aggregated results.
-
-    Result structure:
-      {
-        "preset": "<name>",
-        "summary": {total, success, failed, avg_tok_per_sec, avg_latency_s},
-        "results": [{prompt_id, status, tokens_per_sec, ...}, ...]
-      }
-    """
-    model_path = preset["model_path"]
-    kv_type = preset["kv_type"]
-    preset_name = preset["name"]
-
-    print(f"\n[{preset_name}] model={model_path} kv={kv_type}")
-
-    results = []
-    for idx, prompt in enumerate(prompts, start=1):
-        run = query_llama_server(prompt, model_path, base_url, kv_type, timeout)
-        run["preset"] = preset_name
-        run["prompt_id"] = idx
-        run["prompt_preview"] = prompt[:80]
-
-        status_sym = "✓" if run["status"] == "success" else "✗"
-        tps = run.get("tokens_per_sec", 0.0)
-        print(f"  [{idx}] {status_sym} {tps:.1f} tok/s", flush=True)
-        results.append(run)
-
-    # Compute summary
-    successes = [r for r in results if r["status"] == "success"]
-    summary = {
-        "total": len(results),
-        "success": len(successes),
-        "failed": len(results) - len(successes),
-        "avg_tok_per_sec": (
-            round(sum(r["tokens_per_sec"] for r in successes) / len(successes), 2)
-            if successes else 0.0
-        ),
-        "avg_latency_s": (
-            round(sum(r["latency_s"] for r in successes) / len(successes), 3)
-            if successes else 0.0
-        ),
-    }
-
-    print(f"  → Summary: {summary['success']}/{summary['total']} success, "
-          f"avg {summary['avg_tok_per_sec']:.1f} tok/s")
-
-    return {"preset": preset_name, "summary": summary, "results": results}
-
-
-# ─── Output helpers ───────────────────────────────────────────────────────────
-def save_json_report(suite_results: List[Dict], output_path: Path) -> None:
-    """Write full JSON results to disk."""
-    payload = {
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "generator": "run_allegro_benchmarks.py",
-        "vps": {
-            "host": "Allegro (167.99.126.228)",
-            "cpu_cores": 2,
-            "ram_gb": 8,
-        },
-        "presets": [p["name"] for p in load_presets()],
-        "results": suite_results,
-    }
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-    with open(output_path, "w") as f:
-        json.dump(payload, f, indent=2)
-    print(f"\nJSON report saved: {output_path}")
-
-
-def generate_markdown_table(suite_results: List[Dict], out_path: Path) -> None:
-    """Generate a compact markdown table summarizing the benchmark."""
-    lines = [
-        "# Allegro VPS Benchmark Results — TurboQuant Presets",
-        "",
-        f"*Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
-        "",
-        "| Preset | Model | KV Type | Est. RAM (GB) | Fits 6GB? | Runs? | Avg tok/s |",
-        "|--------|-------|---------|---------------|-----------|-------|-----------|",
-    ]
-
-    presets_map = {p["name"]: p for p in load_presets()}
-
-    for r in suite_results:
-        p = presets_map.get(r["preset"])
-        if p is None:
-            continue
-        fits_emoji = "✅" if p.get("fits_6gb_budget") else "❌"
-        s = r["summary"]
-        if s["success"] == s["total"]:
-            runs_emoji = "✅"
-        else:
-            runs_emoji = f"❌ {s['failed']}/{s['total']}"
-        lines.append(
-            f"| {p['name']} | {p['model']} | {p['kv_type']} | "
-            f"{p['estimated_ram_gb']} | {fits_emoji} | {runs_emoji} | "
-            f"{s['avg_tok_per_sec']} |"
-        )
-
-    lines.extend([
-        "",
-        "**Hardware:** Allegro VPS — 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 LTS",
-        "**Server:** llama-server with TurboQuant Metal/CUDA build on CPU backend",
-        "**Prompts:** `benchmarks/prompts.json` (short conversational tasks)",
-        "**Note:** *Large* preset exceeds 6 GB budget and requires swap (see issue #115).",
-    ])
-
-    out_path.parent.mkdir(parents=True, exist_ok=True)
-    out_path.write_text("\n".join(lines))
-    print(f"Markdown table saved: {out_path}")
-
-
-# ─── Main ─────────────────────────────────────────────────────────────────────
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Allegro VPS benchmark runner — test TurboQuant presets"
-    )
-    parser.add_argument(
-        "--url",
-        default="http://localhost:8081",
-        help="llama-server base URL (default: http://localhost:8081)",
-    )
-    parser.add_argument(
-        "--prompts",
-        default=str(PROMPTS_PATH),
-        help="Path to prompts.json (default: benchmarks/prompts.json)",
-    )
-    parser.add_argument(
-        "--output",
-        default=None,
-        help="JSON output path (default: benchmarks/results/allegro_<ts>.json)",
-    )
-    parser.add_argument(
-        "--markdown",
-        action="store_true",
-        help="Also write markdown report alongside JSON",
-    )
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Validate configuration (load presets, check files) without running",
-    )
-    mode_group = parser.add_mutually_exclusive_group()
-    mode_group.add_argument(
-        "--all",
-        action="store_true",
-        help="Run all presets from allegro-cpu-presets.yaml",
-    )
-    mode_group.add_argument(
-        "--preset",
-        default=None,
-        help="Run only the named preset (e.g. 'medium')",
-    )
-
-    args = parser.parse_args()
-
-    # Ensure prompts file exists
-    if not Path(args.prompts).exists():
-        print(f"ERROR: Prompts file not found: {args.prompts}", file=sys.stderr)
-        sys.exit(1)
-
-    with open(args.prompts) as f:
-        prompts_data = json.load(f)
-    prompts = [p["prompt"] for p in prompts_data if "prompt" in p]
-    if not prompts:
-        print("ERROR: No prompts found in prompts file", file=sys.stderr)
-        sys.exit(1)
-
-    # Dry-run mode
-    if args.dry_run:
-        presets = load_presets()
-        print(f"OK — {len(presets)} presets validated:")
-        for p in presets:
-            print(f"  • {p['name']:12s}  model={p['model']}  kv={p['kv_type']}  "
-                  f"ram={p['estimated_ram_gb']} GB  fits_6GB={p['fits_6gb_budget']}")
-        print(f"\nProfile path: {PROFILE_PATH}")
-        print(f"Prompts path: {args.prompts}")
-        sys.exit(0)
-
-    # Select presets to run
-    if args.preset:
-        preset = get_preset_by_name(args.preset)
-        if not preset:
-            print(f"ERROR: Preset '{args.preset}' not found. Available: "
-                  f"{', '.join(p['name'] for p in load_presets())}", file=sys.stderr)
-            sys.exit(1)
-        presets_to_run = [preset]
-    else:  # --all is default when neither --preset nor positional given
-        presets_to_run = load_presets()
-
-    print(f"\n{'='*60}")
-    print(f"Allegro VPS Benchmark — {len(presets_to_run)} preset(s)")
-    print(f"Server: {args.url}")
-    print(f"Prompts: {len(prompts)} from {args.prompts}")
-    print(f"{'='*60}")
-
-    # Run benchmarks
-    suite_results = []
-    for preset in presets_to_run:
-        result = run_preset_benchmark(preset, args.url, prompts, timeout=120)
-        suite_results.append(result)
-
-    # Save outputs
-    ts = int(time.time())
-    json_out = Path(args.output) if args.output else RESULTS_DIR / f"allegro_{ts}.json"
-    save_json_report(suite_results, json_out)
-
-    if args.markdown:
-        md_out = json_out.with_suffix(".md")
-        generate_markdown_table(suite_results, md_out)
-
-    print("\nDone.")
-
-
-if __name__ == "__main__":
-    main()
--- a/profiles/allegro-cpu-presets.yaml
+++ b/profiles/allegro-cpu-presets.yaml
@@ -1,75 +0,0 @@
-# Allegro VPS TurboQuant Preset Configurations
-# Issue: #95 — Benchmark TurboQuant presets on Allegro VPS (2 cores, 8 GB RAM)
-#
-# Hardware: 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 (VPS)
-# Memory budget: ~6 GB usable for model + KV cache after OS/services overhead
-#
-# Usage:
-#   python3 benchmarks/run_allegro_benchmarks.py --all --markdown
-#   python3 benchmarks/run_allegro_benchmarks.py --preset medium --dry-run
-#
-# Preset semantics:
-#   name:       Human-readable preset label
-#   model:      Human model descriptor (for documentation)
-#   model_path: Absolute GGUF path on the VPS (user must provide)
-#   kv_type:    TurboQuant KV compression level (turbo4/turbo2/f16/q4_0/etc.)
-#   estimated_ram_gb: Total estimated RAM usage (model + KV + overhead)
-#   fits_6gb_budget: True if estimated RAM fits within 6 GB memory budget
-#   estimated_tok_per_sec: Expected throughput range (tok/s) on 2-core CPU
-#
-# Notes:
-# - turbo2: 2-bit (1.5 bits/channel), fastest, lower quality
-# - turbo4: 4-bit (3.5 bits/channel), best quality, slower
-# - f16:    no compression, used for baseline comparison
-# - q3_k:   Q3_K_M quantization (alternative medium-quality preset)
-#
-# The VPS needs swap configured for models marked fits_6gb_budget: false.
-# See issue #115 for Allegro swap configuration.
-
-presets:
-  - name: tiny
-    model: "2B Q4 (Q4_K_M)"
-    model_path: "/path/to/2b-q4_k_m.gguf"  # USER: replace with actual path
-    kv_type: "f16"
-    estimated_ram_gb: 2.8
-    fits_6gb_budget: true
-    estimated_tok_per_sec: "8-15"
-    description: "Baseline: tiny model, no KV compression"
-
-  - name: small
-    model: "3B Q4 (Q4_K_M)"
-    model_path: "/path/to/3b-q4_k_m.gguf"
-    kv_type: "turbo2"
-    estimated_ram_gb: 3.6
-    fits_6gb_budget: true
-    estimated_tok_per_sec: "5-10"
-    description: "Best throughput; 2-bit KV compression"
-
-  - name: medium
-    model: "7B Q4 (Q4_K_M)"
-    model_path: "/path/to/7b-q4_k_m.gguf"
-    kv_type: "turbo4"
-    estimated_ram_gb: 5.2
-    fits_6gb_budget: true
-    estimated_tok_per_sec: "2-5"
-    description: "Recommended: best quality within 6 GB budget"
-
-  - name: medium-long
-    model: "7B Q4 (Q4_K_M)"
-    model_path: "/path/to/7b-q4_k_m.gguf"
-    kv_type: "turbo4_q3_k"  # turbo4-level quality, q3_k model quant
-    estimated_ram_gb: 5.8
-    fits_6gb_budget: true
-    estimated_tok_per_sec: "1.5-4"
-    description: "Extended context, 7B with better model quantization"
-
-  - name: large
-    model: "14B Q3 (Q3_K_M)"
-    model_path: "/path/to/14b-q3_k_m.gguf"
-    kv_type: "turbo4"
-    estimated_ram_gb: 7.2
-    fits_6gb_budget: false
-    estimated_tok_per_sec: "0.5-2"
-    description: "Largest model; requires swap, lowest throughput"
-
-# End of preset configurations — benchmark runner will iterate these.
--- a/tests/test_allegro_benchmarks.py
+++ b/tests/test_allegro_benchmarks.py
@@ -1,211 +0,0 @@
-#!/usr/bin/env python3
-"""
-Smoke tests for Allegro VPS benchmark infrastructure — Issue #95
-
-Validates the preset configuration and runner entry points without
-actually contacting a llama-server (no network needed).
-"""
-
-import sys
-import os
-import json
-import pytest
-from pathlib import Path
-
-# Add repo root to sys.path
-REPO_ROOT = Path(__file__).resolve().parents[1]
-sys.path.insert(0, str(REPO_ROOT))
-
-
-# ─── Test fixtures ────────────────────────────────────────────────────────────
-PROFILE_PATH = REPO_ROOT / "profiles" / "allegro-cpu-presets.yaml"
-BENCHMARK_RUNNER = REPO_ROOT / "benchmarks" / "run_allegro_benchmarks.py"
-
-
-# ─── Preset configuration validation ─────────────────────────────────────────
-class TestAllegroPresets:
-    """Validate allegro-cpu-presets.yaml structure and values."""
-
-    def test_profile_file_exists(self):
-        assert PROFILE_PATH.exists(), f"Profile not found: {PROFILE_PATH}"
-
-    def test_profile_loads_as_yaml(self):
-        import yaml
-        with open(PROFILE_PATH) as f:
-            data = yaml.safe_load(f)
-        assert "presets" in data, "Profile must have a 'presets' key"
-        assert isinstance(data["presets"], list), "presets must be a list"
-        assert len(data["presets"]) > 0, "presets list cannot be empty"
-
-    def test_each_preset_has_required_fields(self):
-        import yaml
-        with open(PROFILE_PATH) as f:
-            data = yaml.safe_load(f)
-
-        required = {"name", "model", "model_path", "kv_type",
-                    "estimated_ram_gb", "fits_6gb_budget",
-                    "estimated_tok_per_sec", "description"}
-
-        for p in data["presets"]:
-            missing = required - set(p.keys())
-            assert not missing, f"Preset '{p.get('name','?')}' missing fields: {missing}"
-
-    def test_ram_estimates_are_positive(self):
-        import yaml
-        with open(PROFILE_PATH) as f:
-            data = yaml.safe_load(f)
-
-        for p in data["presets"]:
-            ram = p["estimated_ram_gb"]
-            assert ram > 0, f"{p['name']}: estimated_ram_gb must be positive"
-
-    def test_ram_estimates_reasonable_for_8gb_vps(self):
-        """No single preset should exceed the total 8 GB RAM (even with swap)."""
-        import yaml
-        with open(PROFILE_PATH) as f:
-            data = yaml.safe_load(f)
-
-        for p in data["presets"]:
-            ram = p["estimated_ram_gb"]
-            assert ram < 10, (
-                f"{p['name']}: estimated_ram_gb={ram} GB seems too high "
-                f"for an 8 GB VPS even with swap"
-            )
-
-    def test_kv_type_is_string(self):
-        import yaml
-        with open(PROFILE_PATH) as f:
-            data = yaml.safe_load(f)
-        for p in data["presets"]:
-            assert isinstance(p["kv_type"], str)
-            assert len(p["kv_type"]) > 0
-
-    def test_fits_6gb_budget_is_boolean(self):
-        import yaml
-        with open(PROFILE_PATH) as f:
-            data = yaml.safe_load(f)
-        for p in data["presets"]:
-            assert isinstance(p["fits_6gb_budget"], bool)
-
-    def test_preset_names_are_unique(self):
-        import yaml
-        with open(PROFILE_PATH) as f:
-            data = yaml.safe_load(f)
-        names = [p["name"] for p in data["presets"]]
-        assert len(names) == len(set(names)), "Duplicate preset names found"
-
-    def test_expected_preset_names_present(self):
-        """Sanity check: the documented 5 presets should exist."""
-        import yaml
-        with open(PROFILE_PATH) as f:
-            data = yaml.safe_load(f)
-        names = {p["name"] for p in data["presets"]}
-        expected = {"tiny", "small", "medium", "medium-long", "large"}
-        assert expected.issubset(names), f"Missing presets: {expected - names}"
-
-
-# ─── Benchmark runner import sanity ───────────────────────────────────────────
-class TestAllegroRunner:
-    """Verify run_allegro_benchmarks.py can be imported and exposes the expected API."""
-
-    def test_runner_file_exists(self):
-        assert BENCHMARK_RUNNER.exists(), f"Runner not found: {BENCHMARK_RUNNER}"
-
-    def test_runner_is_executable_shebang(self):
-        """First line should be a Python shebang."""
-        with open(BENCHMARK_RUNNER) as f:
-            first = f.readline().strip()
-        assert first.startswith("#!"), "Missing shebang"
-        assert "python" in first.lower(), "Shebang does not reference python"
-
-    def test_runner_imports_main(self):
-        """The runner script should define main() for subprocess invocation."""
-        import importlib.util
-        spec = importlib.util.spec_from_file_location(
-            "run_allegro_benchmarks", BENCHMARK_RUNNER
-        )
-        mod = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(mod)  # type: ignore[attr-defined]
-        assert hasattr(mod, "main"), "runner must define a main() function"
-
-    def test_runner_dry_run_invocation(self):
-        """Subprocess dry-run should exit 0 and print OK."""
-        import subprocess
-        env = os.environ.copy()
-        # Ensure we use the same python as the test runner
-        result = subprocess.run(
-            [sys.executable, str(BENCHMARK_RUNNER), "--dry-run"],
-            capture_output=True,
-            text=True,
-            env=env,
-            timeout=30,
-        )
-        assert result.returncode == 0, (
-            f"dry-run failed (code {{result.returncode}})\nSTDERR: {{result.stderr}}"
-        )
-        assert "OK" in result.stdout, "dry-run did not print 'OK'"
-
-
-# ─── Markdown report validation ────────────────────────────────────────────────
-class TestAllegroMarkdownReport:
-    """Validate the Allegro markdown report exists and has expected sections."""
-
-    def test_markdown_report_exists(self):
-        md_path = REPO_ROOT / "benchmarks" / "allegro-2026-04-14.md"
-        assert md_path.exists(), f"Markdown report not found: {md_path}"
-
-    def test_markdown_contains_presets_table(self):
-        md_path = REPO_ROOT / "benchmarks" / "allegro-2026-04-14.md"
-        content = md_path.read_text()
-        assert "| Preset" in content, "Missing presets table header"
-        assert "| tiny" in content, "Missing 'tiny' preset row"
-        assert "| medium" in content, "Missing 'medium' preset row"
-
-    def test_markdown_contains_hardware_spec(self):
-        md_path = REPO_ROOT / "benchmarks" / "allegro-2026-04-14.md"
-        content = md_path.read_text()
-        assert "2 vCPU" in content or "2 cores" in content,             "Should mention the Allegro VPS core count"
-        assert "8 GB" in content, "Should mention the Allegro VPS RAM"
-
-    def test_markdown_contains_recommendation(self):
-        md_path = REPO_ROOT / "benchmarks" / "allegro-2026-04-14.md"
-        content = md_path.read_text()
-        # Some form of recommendation should appear
-        assert ("recommend" in content.lower() or
-                "Recommended" in content or
-                "best quality" in content.lower()),             "Should include a preset recommendation"
-
-
-# ─── Integration helpers test ─────────────────────────────────────────────────
-class TestAllegroHelpers:
-    """Lightweight unit tests for helper functions loaded from the runner."""
-
-    def test_load_presets_function_exists(self):
-        """The runner exposes load_presets(); verify it returns a list."""
-        import importlib.util
-        spec = importlib.util.spec_from_file_location(
-            "run_allegro_benchmarks", BENCHMARK_RUNNER
-        )
-        mod = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(mod)  # type: ignore[attr-defined]
-        presets = mod.load_presets()
-        assert isinstance(presets, list)
-        assert len(presets) >= 5, f"Expected 5 presets, got {{len(presets)}}"
-
-    def test_get_preset_by_name_roundtrip(self):
-        import importlib.util
-        spec = importlib.util.spec_from_file_location(
-            "run_allegro_benchmarks", BENCHMARK_RUNNER
-        )
-        mod = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(mod)
-        for expected in ("tiny", "small", "medium"):
-            p = mod.get_preset_by_name(expected)
-            assert p is not None, f"get_preset_by_name('{expected}') returned None"
-            assert p["name"] == expected
-
-
-# ─── Entry point ───────────────────────────────────────────────────────────────
-if __name__ == "__main__":
-    # Allow running as `python tests/test_allegro_benchmarks.py` for quick smoke.
-    pytest.main([__file__, "-v"])