Compare commits
1 Commits
step35/95-
...
step35/98-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e20439b544 |
19
ansible/README.md
Normal file
19
ansible/README.md
Normal file
@@ -0,0 +1,19 @@
|
||||
# TurboQuant Ansible Deployment
|
||||
|
||||
Deploy TurboQuant-compressed Gemma 4 inference across fleet nodes.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Copy and edit inventory
|
||||
cp ansible/inventory.ini.example ansible/inventory.ini
|
||||
|
||||
# Deploy to all nodes
|
||||
ansible-playbook -i ansible/inventory.ini ansible/deploy_turboquant.yml
|
||||
|
||||
# Run health check
|
||||
ansible -i ansible/inventory.ini all -m shell -a "sudo /opt/turboquant/health_check.sh"
|
||||
|
||||
# Run integration test
|
||||
ansible -i ansible/inventory.ini all -m shell -a "curl -s http://localhost:8081/v1/chat/completions -H 'Content-Type: application/json' -d '{\"model\":\"gemma-4\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello\"}]}'"
|
||||
```
|
||||
69
ansible/deploy_turboquant.yml
Normal file
69
ansible/deploy_turboquant.yml
Normal file
@@ -0,0 +1,69 @@
|
||||
---
|
||||
# deploy_turboquant.yml — Deploy TurboQuant across fleet nodes
|
||||
# Usage: ansible-playbook -i ansible/inventory.ini ansible/deploy_turboquant.yml
|
||||
|
||||
- name: Deploy TurboQuant to Mac (local)
  # Applies the turboquant-deploy role to the `mac` inventory group.
  hosts: mac
  gather_facts: true
  become: true

  roles:
    - turboquant-deploy

  vars:
    # Install / service layout
    turboquant_install_dir: '/opt/turboquant'
    turboquant_service_name: 'turboquant'
    turboquant_user: 'turboquant'
    # Listener
    turboquant_host: '0.0.0.0'
    turboquant_port: 8081
    # Model and KV-cache settings
    turboquant_model: 'gemma-4'
    turboquant_model_file: 'gemma-4-26B-A4B.gguf'
    turboquant_context: 131072
    turboquant_kv_type: 'turbo4'
    turboquant_layer_adaptive: 7
    # Node descriptors
    node_preset: 'turboquant_k8v4'
    node_hardware: 'M1-16GB'
|
||||
|
||||
- name: Deploy TurboQuant to Allegro VPS
  # Applies the turboquant-deploy role to the `allegro` inventory group.
  hosts: allegro
  gather_facts: true
  become: true

  roles:
    - turboquant-deploy

  vars:
    # Install / service layout
    turboquant_install_dir: '/opt/turboquant'
    turboquant_service_name: 'turboquant'
    turboquant_user: 'turboquant'
    # Listener
    turboquant_host: '0.0.0.0'
    turboquant_port: 8081
    # Model and KV-cache settings
    turboquant_model: 'gemma-4-E4B'
    turboquant_model_file: 'gemma-4-E4B.gguf'
    turboquant_context: 65536
    turboquant_kv_type: 'q4_0'
    turboquant_layer_adaptive: 0
    # Node descriptors
    node_preset: 'turboquant_4bit_nc'
    node_hardware: 'VPS-2c8g'
|
||||
|
||||
- name: Deploy TurboQuant to Ezra VPS
  # Applies the turboquant-deploy role to the `ezra` inventory group.
  hosts: ezra
  gather_facts: true
  become: true

  roles:
    - turboquant-deploy

  vars:
    # Install / service layout
    turboquant_install_dir: '/opt/turboquant'
    turboquant_service_name: 'turboquant'
    turboquant_user: 'turboquant'
    # Listener
    turboquant_host: '0.0.0.0'
    turboquant_port: 8081
    # Model and KV-cache settings
    turboquant_model: 'gemma-4-E4B'
    turboquant_model_file: 'gemma-4-E4B.gguf'
    turboquant_context: 65536
    turboquant_kv_type: 'q4_0'
    turboquant_layer_adaptive: 0
    # Node descriptors
    node_preset: 'turboquant_4bit_nc'
    node_hardware: 'VPS-2c8g'
|
||||
23
ansible/health_check.sh
Executable file
23
ansible/health_check.sh
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
# Health check for TurboQuant llama-server / vLLM deployment
#
# Environment:
#   TURBOQUANT_PORT      port to probe (default: 8081)
#   TURBOQUANT_ENDPOINT  full URL to probe (default: http://localhost:${PORT}/v1/models)
#
# Exit codes:
#   0  endpoint answered HTTP 200
#   1  endpoint answered anything else, or could not be reached
#   2  curl is not installed

set -e

PORT="${TURBOQUANT_PORT:-8081}"
ENDPOINT="${TURBOQUANT_ENDPOINT:-http://localhost:${PORT}/v1/models}"

echo "Checking TurboQuant server health at ${ENDPOINT}..."

if ! command -v curl &> /dev/null; then
    echo "curl not found; cannot perform health check"
    exit 2
fi

# curl exits non-zero when the server is unreachable or times out. Without the
# `|| true`, `set -e` would abort the script right here and the "unhealthy"
# report below (and its exit code 1) would never be produced.
response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "${ENDPOINT}") || true
# Fall back to curl's "no response" code if nothing at all was printed.
response="${response:-000}"

if [ "${response}" = "200" ]; then
    echo "✅ Server healthy — HTTP ${response}"
    exit 0
fi

echo "❌ Server unhealthy — HTTP ${response}"
exit 1
|
||||
22
ansible/inventory.ini.example
Normal file
22
ansible/inventory.ini.example
Normal file
@@ -0,0 +1,22 @@
|
||||
# Ansible inventory for TurboQuant fleet deployment
# Edit this file and save as ansible/inventory.ini before running

[mac]
# Local MacBook — runs llama-server with Metal + TurboQuant
# ansible_connection=local: tasks run directly on the control machine.
timmy-mac ansible_host=localhost ansible_connection=local

[allegro]
# Allegro VPS — Debian, runs llama-server or vLLM with GGUF q4_0
allegro-primary ansible_host=167.99.126.228 ansible_user=root

[ezra]
# Ezra VPS — Ubuntu, runs llama-server or vLLM
# Reached over SSH like Allegro. Do not add ansible_connection=local here:
# it makes Ansible ignore ansible_host/ansible_user and run the Ezra play on
# the control machine instead of the VPS.
ezra-primary ansible_host=143.198.27.163 ansible_user=root

[turbonodes:children]
mac
allegro
ezra

[turbonodes:vars]
ansible_python_interpreter=/usr/bin/python3
|
||||
68
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
68
ansible/roles/turboquant-deploy/tasks/darwin.yml
Normal file
@@ -0,0 +1,68 @@
|
||||
---
|
||||
# macOS deployment — builds llama.cpp with Metal + TurboQuant
|
||||
|
||||
# Task list for macOS nodes: verify toolchain, fetch and build the TurboQuant
# llama.cpp fork with Metal, stage the health check, and print start instructions.
# Every task is guarded by ansible_os_family == "Darwin".

- name: Ensure Xcode command line tools are installed
  command: xcode-select -p
  register: xcode_check
  changed_when: false
  # Never fail here; the return code is evaluated by the next task.
  failed_when: false
  when: ansible_os_family == "Darwin"
  tags: [turboquant, darwin]

- name: Install Xcode CLI tools if missing (macOS)
  # No shell features are needed, so `command` is used instead of `shell`.
  command: xcode-select --install
  when: ansible_os_family == "Darwin" and xcode_check.rc != 0
  tags: [turboquant, darwin]

- name: Check for Git
  command: which git
  register: git_check
  # Read-only probe: do not report "changed" on every run.
  changed_when: false
  when: ansible_os_family == "Darwin"
  tags: [turboquant, deps]

- name: Clone llama.cpp TurboQuant fork
  git:
    repo: "https://github.com/TheTom/llama-cpp-turboquant.git"
    dest: "{{ turboquant_install_dir }}/llama.cpp"
    version: "feature/turboquant-kv-cache"
    force: true
  when: ansible_os_family == "Darwin"
  tags: [turboquant, source]

- name: Build llama.cpp with Metal + TurboQuant
  shell: |
    cd {{ turboquant_install_dir }}/llama.cpp
    cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_METAL=ON
    cmake --build build -j$(sysctl -n hw.ncpu)
  args:
    # Skip the build once the server binary exists.
    creates: "{{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server"
  when: ansible_os_family == "Darwin"
  tags: [turboquant, build]

- name: Create models directory
  file:
    path: "{{ turboquant_install_dir }}/models"
    state: directory
    mode: '0755'
  when: ansible_os_family == "Darwin"
  tags: [turboquant, deploy]

- name: Deploy health check script
  copy:
    src: "../../health_check.sh"
    dest: "{{ turboquant_install_dir }}/health_check.sh"
    mode: '0755'
  when: ansible_os_family == "Darwin"
  tags: [turboquant, deploy]

- name: Print macOS manual start instructions
  debug:
    # TURBO_LAYER_ADAPTIVE is passed through `env` after `sudo -u`: a plain
    # `export` before sudo is dropped by sudo's default environment reset.
    msg: |
      Mac deployment complete. To start the server manually:
        sudo -u {{ turboquant_user }} env TURBO_LAYER_ADAPTIVE={{ turboquant_layer_adaptive }} \
          {{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server \
          -m {{ turboquant_install_dir }}/models/{{ turboquant_model_file }} \
          --host {{ turboquant_host }} --port {{ turboquant_port }} \
          -c {{ turboquant_context }} -ctk {{ turboquant_kv_type }} -ctv {{ turboquant_kv_type }}
  when: ansible_os_family == "Darwin"
  tags: [turboquant, deploy]
|
||||
92
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
92
ansible/roles/turboquant-deploy/tasks/debian.yml
Normal file
@@ -0,0 +1,92 @@
|
||||
---
|
||||
# Debian/Ubuntu deployment — installs llama.cpp with TurboQuant, uses systemd
|
||||
|
||||
# Task list for Debian/Ubuntu nodes: packages, service account, source checkout,
# build, systemd unit, and health check script — in that order.

- name: Update apt cache
  tags: [turboquant, deps]
  apt:
    cache_valid_time: 3600
    update_cache: true

- name: Install build dependencies
  tags: [turboquant, deps]
  apt:
    state: present
    name:
      - build-essential
      - cmake
      - git
      - curl
      - python3
      - python3-pip
      - python3-venv

- name: Create turboquant user
  tags: [turboquant, prereq]
  user:
    name: "{{ turboquant_user }}"
    shell: /usr/sbin/nologin
    system: true
    create_home: false

- name: Create install directory
  tags: [turboquant, prereq]
  file:
    state: directory
    path: "{{ turboquant_install_dir }}"
    owner: "{{ turboquant_user }}"
    group: "{{ turboquant_user }}"
    mode: "0755"

- name: Clone llama.cpp TurboQuant fork
  tags: [turboquant, source]
  git:
    repo: "https://github.com/TheTom/llama-cpp-turboquant.git"
    version: "feature/turboquant-kv-cache"
    dest: "{{ turboquant_install_dir }}/llama.cpp"
    force: true

- name: Build llama.cpp with TurboQuant
  tags: [turboquant, build]
  args:
    # The build is skipped once the server binary exists.
    creates: "{{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server"
  shell: |
    cd {{ turboquant_install_dir }}/llama.cpp
    cmake -B build -DCMAKE_BUILD_TYPE=Release
    cmake --build build -j$(nproc)

- name: Create models directory
  tags: [turboquant, deploy]
  file:
    state: directory
    path: "{{ turboquant_install_dir }}/models"
    owner: "{{ turboquant_user }}"
    group: "{{ turboquant_user }}"
    mode: "0755"

- name: Deploy systemd service unit
  tags: [turboquant, service]
  template:
    src: turboquant.service.j2
    dest: "/etc/systemd/system/{{ turboquant_service_name }}.service"
    mode: "0644"

- name: Reload systemd daemon
  tags: [turboquant, service]
  systemd:
    daemon_reload: true

- name: Enable and start TurboQuant service
  tags: [turboquant, service]
  systemd:
    name: "{{ turboquant_service_name }}"
    enabled: true
    state: started

- name: Deploy health check script
  tags: [turboquant, deploy]
  copy:
    src: "../../health_check.sh"
    dest: "{{ turboquant_install_dir }}/health_check.sh"
    owner: "{{ turboquant_user }}"
    group: "{{ turboquant_user }}"
    mode: "0755"
|
||||
45
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
45
ansible/roles/turboquant-deploy/tasks/integration_test.yml
Normal file
@@ -0,0 +1,45 @@
|
||||
---
|
||||
# Integration test — verify server responds to a simple query
|
||||
|
||||
# Post-deploy smoke test: wait for the server, send one chat completion, and
# check the assistant's reply. All tasks are skipped on Darwin, where the
# server is started manually.

- name: Wait for service to be ready (HTTP 200 on /v1/models)
  uri:
    url: "http://localhost:{{ turboquant_port }}/v1/models"
    method: GET
    status_code: 200
  register: svc_ready
  # Poll for up to 12 x 5 s = 60 s.
  retries: 12
  delay: 5
  until: svc_ready.status == 200
  when: ansible_os_family != "Darwin"  # skip on mac for now; service starts manually
  tags: [turboquant, healthcheck]

- name: Run integration test — simple query
  uri:
    url: "http://localhost:{{ turboquant_port }}/v1/chat/completions"
    method: POST
    body_format: json
    body:
      model: "{{ turboquant_model }}"
      messages:
        - role: "user"
          content: "Test: 2+2 equals what? Answer with only the number."
      max_tokens: 5
      temperature: 0.0
    return_content: true
  register: completion
  when: ansible_os_family != "Darwin"
  tags: [turboquant, test]

- name: Verify response contains expected answer
  assert:
    that:
      - completion.status == 200
      # Inspect only the assistant message. The raw body also carries the
      # model name ("gemma-4...") and other metadata, so searching it for '4'
      # would succeed no matter what the model answered.
      - "'4' in (completion.json.choices[0].message.content | default('', true))"
    fail_msg: "TurboQuant server did not return the expected answer to the test prompt"
  when: ansible_os_family != "Darwin"
  tags: [turboquant, test]

- name: Log integration result
  debug:
    msg: "Integration test passed — TurboQuant server responded correctly"
  when: ansible_os_family != "Darwin"
  tags: [turboquant, test]
|
||||
17
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
17
ansible/roles/turboquant-deploy/tasks/main.yml
Normal file
@@ -0,0 +1,17 @@
|
||||
---
|
||||
# Main entry point — common setup followed by OS-specific tasks
|
||||
|
||||
# Role entry point: shared directory setup, then the task file matching the
# host's OS family, then the post-deploy integration tests.

- name: Ensure install directory exists (common)
  tags: [turboquant, prereq]
  file:
    state: directory
    path: "{{ turboquant_install_dir }}"
    mode: "0755"

- name: Include OS-specific tasks
  tags: [turboquant, deploy]
  # Resolves to the lower-cased OS family name plus ".yml" (this role ships
  # darwin.yml and debian.yml).
  include_tasks: "{{ ansible_os_family | lower }}.yml"

- name: Run post-deploy integration tests
  tags: [turboquant, test]
  include_tasks: integration_test.yml
|
||||
@@ -0,0 +1,25 @@
|
||||
---
# TurboQuant Server Configuration
# Auto-generated by Ansible — node: {{ ansible_host | default('localhost') }}
# NOTE(review): no task in the visible role renders this template — confirm where it is deployed.

# Listener and model identity
server:
  host: "{{ turboquant_host }}"
  port: {{ turboquant_port }}
  model: "{{ turboquant_model }}"
  model_file: "{{ turboquant_model_file }}"
  base_url: "http://localhost:{{ turboquant_port }}/v1"

# KV-cache compression settings
turboquant:
  enabled: true
  preset: "{{ node_preset }}"
  kv_type: "{{ turboquant_kv_type }}"
  layer_adaptive_mode: {{ turboquant_layer_adaptive }}

# Context size and thread count (threads falls back to 2 when the vCPU fact is absent)
performance:
  max_context: {{ turboquant_context }}
  threads: {{ ansible_processor_vcpus | default(2) }}

# Where and how the node was deployed
deployment:
  install_dir: "{{ turboquant_install_dir }}"
  service_name: "{{ turboquant_service_name }}"
  node_hardware: "{{ node_hardware }}"
|
||||
@@ -0,0 +1,25 @@
|
||||
[Unit]
Description=TurboQuant {{ turboquant_model }} Inference Server
After=network.target

[Service]
Type=simple
User={{ turboquant_user }}
Group={{ turboquant_user }}
WorkingDirectory={{ turboquant_install_dir }}
Environment="TURBO_LAYER_ADAPTIVE={{ turboquant_layer_adaptive }}"
# The role builds llama-server in place under llama.cpp/build/bin and never
# copies it elsewhere, so the unit must start it from the build tree.
ExecStart={{ turboquant_install_dir }}/llama.cpp/build/bin/llama-server \
    -m {{ turboquant_install_dir }}/models/{{ turboquant_model_file }} \
    --host {{ turboquant_host }} \
    --port {{ turboquant_port }} \
    -c {{ turboquant_context }} \
    -ctk {{ turboquant_kv_type }} -ctv {{ turboquant_kv_type }} \
    --threads {{ ansible_processor_vcpus | default(2) }}

# Restart the server 5 seconds after any exit; logs go to the journal.
Restart=always
RestartSec=5
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
|
||||
@@ -1,56 +0,0 @@
|
||||
# Allegro VPS Benchmark Analysis — TurboQuant Presets
|
||||
|
||||
*Generated: 2026-04-26*
|
||||
|
||||
> **Hardware:** Allegro VPS — 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 LTS
|
||||
> **Server:** `llama-server` with TurboQuant KV compression (CPU backend)
|
||||
> **Scope:** Compare TurboQuant preset configurations for memory vs. throughput trade-offs
|
||||
|
||||
## Preset Summary
|
||||
|
||||
| Preset | Model | KV Type | Est. RAM (GB) | Fits 6GB? | Target |
|
||||
|--------|-------|---------|---------------|-----------|--------|
|
||||
| tiny | 2B Q4 | f16 | 2.8 | ✅ | Baseline |
|
||||
| small | 3B Q4 | turbo2 | 3.6 | ✅ | Best throughput |
|
||||
| medium | 7B Q4 | turbo4 | 5.2 | ✅ | **Recommended** (quality within budget) |
|
||||
| medium-long | 7B Q4 | turbo4 (q3_k) | 5.8 | ✅ | Extended context |
|
||||
| large | 14B Q3 | turbo4 | 7.2 | ❌ | Requires swap |
|
||||
|
||||
## Expected Results — Qualitative
|
||||
|
||||
| Preset | Expected tok/s | Notes |
|
||||
|--------|---------------|-------|
|
||||
| tiny | 8–15 | Fast baseline, no KV compression |
|
||||
| small | 5–10 | 2-bit KV compression, good speed |
|
||||
| medium | 2–5 | 4-bit KV compression, balanced |
|
||||
| medium-long | 1.5–4 | Better model quant, longer context |
|
||||
| large | 0.5–2 | Large model; swap may bottleneck |
|
||||
|
||||
> **Recommendation (medium):** Best quality within the 6 GB usable memory budget on Allegro.
|
||||
> 7B Q4 with turbo4 KV gives ~5.2 GB total; 14B requires swap (issue #115).
|
||||
|
||||
## Running the Benchmarks
|
||||
|
||||
```bash
|
||||
# Validate configuration (does not hit the server)
|
||||
python3 benchmarks/run_allegro_benchmarks.py --dry-run
|
||||
|
||||
# Run all presets and produce both JSON and markdown table
|
||||
python3 benchmarks/run_allegro_benchmarks.py --all --markdown
|
||||
|
||||
# Run a single preset (after filling in model_path in the YAML)
|
||||
python3 benchmarks/run_allegro_benchmarks.py --preset medium
|
||||
```
|
||||
|
||||
## Deliverables
|
||||
|
||||
- ✅ `profiles/allegro-cpu-presets.yaml` — preset configurations
|
||||
- ✅ `benchmarks/run_allegro_benchmarks.py` — runner script
|
||||
- ✅ `benchmarks/allegro-2026-04-14.md` — this analysis (expected results)
|
||||
- ✅ `tests/test_allegro_benchmarks.py` — smoke tests for preset loading/validation
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Place GGUF model files at the `model_path` locations in `allegro-cpu-presets.yaml`.
|
||||
2. Ensure llama-server with TurboQuant is running on port 8081.
|
||||
3. Run `--all --markdown` and commit the generated `benchmarks/results/allegro_<timestamp>.md` results.
|
||||
@@ -1,348 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Allegro VPS Benchmark Runner — Issue #95
|
||||
|
||||
Iterates preset configurations, benchmarks against a local llama-server
|
||||
with the specified TurboQuant KV settings, and produces JSON + Markdown reports.
|
||||
|
||||
Prerequisites on Allegro VPS:
|
||||
- llama-server with TurboQuant support running on http://localhost:8081
|
||||
- Models downloaded to the paths specified in allegro-cpu-presets.yaml
|
||||
- pip install pyyaml requests (or use system python + pip)
|
||||
|
||||
Usage:
|
||||
# Validate configuration only
|
||||
python3 benchmarks/run_allegro_benchmarks.py --dry-run
|
||||
|
||||
# Run all presets and emit markdown table
|
||||
python3 benchmarks/run_allegro_benchmarks.py --all --markdown
|
||||
|
||||
# Run a single preset (after updating model_path in the YAML)
|
||||
python3 benchmarks/run_allegro_benchmarks.py --preset medium
|
||||
|
||||
# Run against a non-local server
|
||||
python3 benchmarks/run_allegro_benchmarks.py --url http://192.168.1.100:8081 --all
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
# ─── Paths ────────────────────────────────────────────────────────────────────
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
PROFILE_PATH = REPO_ROOT / "profiles" / "allegro-cpu-presets.yaml"
|
||||
PROMPTS_PATH = REPO_ROOT / "benchmarks" / "prompts.json"
|
||||
RESULTS_DIR = REPO_ROOT / "benchmarks" / "results"
|
||||
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# ─── Preset loader ────────────────────────────────────────────────────────────
|
||||
def load_presets() -> List[Dict]:
    """Load the preset list from ``PROFILE_PATH`` (allegro-cpu-presets.yaml).

    Returns:
        The list stored under the top-level ``presets`` key. An empty list is
        returned (after a warning on stderr) when the file is empty, has no
        ``presets`` key, or the key is null/empty.

    Exits the process with status 1 when PyYAML is not installed.
    """
    try:
        import yaml
    except ImportError:
        print("ERROR: PyYAML required. Install: pip install pyyaml", file=sys.stderr)
        sys.exit(1)

    # The profile contains non-ASCII characters, so do not rely on the locale encoding.
    with open(PROFILE_PATH, encoding="utf-8") as f:
        # An empty document parses to None; treat it as an empty mapping.
        data = yaml.safe_load(f) or {}

    # `presets:` with no value parses to None; normalise it to a list.
    presets = data.get("presets") or []
    if not presets:
        print("WARNING: No presets found in profile", file=sys.stderr)
    return presets
|
||||
|
||||
|
||||
def get_preset_by_name(name: str) -> Optional[Dict]:
    """Return the first loaded preset whose ``name`` equals *name*, else ``None``."""
    return next(
        (entry for entry in load_presets() if entry["name"] == name),
        None,
    )
|
||||
|
||||
|
||||
# ─── Backend: llama-server ────────────────────────────────────────────────────
|
||||
def query_llama_server(prompt: str, model: str, base_url: str,
                       kv_type: str, timeout: int = 120) -> Dict:
    """
    Send one non-streaming request to ``<base_url>/v1/completions``.

    On success the returned dict has: status, latency_s, ttft_s (always None),
    tokens_per_sec, completion_tokens, prompt_tokens and kv_type.
    On any exception it has: status="failed", error, latency_s,
    tokens_per_sec (0.0) and kv_type.
    """
    endpoint = f"{base_url.rstrip('/')}/v1/completions"
    t0 = time.time()

    try:
        request_body = {
            "model": model,
            "prompt": prompt,
            "max_tokens": 64,  # Short responses keep benchmark snappy
            "temperature": 0.7,
            "stream": False,
        }
        response = requests.post(endpoint, json=request_body, timeout=timeout)
        response.raise_for_status()

        usage = response.json().get("usage", {})
        n_completion = usage.get("completion_tokens", 0)
        n_prompt = usage.get("prompt_tokens", 0)

        elapsed = time.time() - t0
        # Estimate tokens/sec (subtract 0.1s for prompt eval overhead)
        if n_completion > 0:
            rate = n_completion / max(elapsed - 0.1, 0.01)
        else:
            rate = 0.0

        return {
            "status": "success",
            "latency_s": round(elapsed, 3),
            "ttft_s": None,  # not measured: the request is made with stream=False
            "tokens_per_sec": round(rate, 2),
            "completion_tokens": n_completion,
            "prompt_tokens": n_prompt,
            "kv_type": kv_type,
        }
    except Exception as exc:
        # Any failure (network, HTTP status, malformed body) becomes a "failed" record.
        return {
            "status": "failed",
            "error": str(exc),
            "latency_s": round(time.time() - t0, 3),
            "tokens_per_sec": 0.0,
            "kv_type": kv_type,
        }
|
||||
|
||||
|
||||
# ─── Benchmark logic ──────────────────────────────────────────────────────────
|
||||
def run_preset_benchmark(preset: Dict, base_url: str,
                         prompts: List[str], timeout: int = 120) -> Dict:
    """
    Benchmark one preset: query the server once per prompt and aggregate.

    Returns:
        {
          "preset": "<name>",
          "summary": {total, success, failed, avg_tok_per_sec, avg_latency_s},
          "results": [{prompt_id, status, tokens_per_sec, ...}, ...]
        }
    Averages cover successful runs only and are 0.0 when there are none.
    """
    model_path = preset["model_path"]
    kv_type = preset["kv_type"]
    preset_name = preset["name"]

    print(f"\n[{preset_name}] model={model_path} kv={kv_type}")

    runs = []
    for prompt_id, text in enumerate(prompts, start=1):
        record = query_llama_server(text, model_path, base_url, kv_type, timeout)
        record.update(preset=preset_name, prompt_id=prompt_id, prompt_preview=text[:80])

        mark = "✓" if record["status"] == "success" else "✗"
        rate = record.get("tokens_per_sec", 0.0)
        print(f"  [{prompt_id}] {mark} {rate:.1f} tok/s", flush=True)
        runs.append(record)

    # Compute summary over the successful runs
    ok_runs = [rec for rec in runs if rec["status"] == "success"]

    def _mean(field: str, digits: int) -> float:
        if not ok_runs:
            return 0.0
        return round(sum(rec[field] for rec in ok_runs) / len(ok_runs), digits)

    summary = {
        "total": len(runs),
        "success": len(ok_runs),
        "failed": len(runs) - len(ok_runs),
        "avg_tok_per_sec": _mean("tokens_per_sec", 2),
        "avg_latency_s": _mean("latency_s", 3),
    }

    print(f"  → Summary: {summary['success']}/{summary['total']} success, "
          f"avg {summary['avg_tok_per_sec']:.1f} tok/s")

    return {"preset": preset_name, "summary": summary, "results": runs}
|
||||
|
||||
|
||||
# ─── Output helpers ───────────────────────────────────────────────────────────
|
||||
def save_json_report(suite_results: List[Dict], output_path: Path) -> None:
    """Serialize the benchmark suite, plus run metadata, to *output_path* as JSON.

    Parent directories are created when missing; the saved path is printed.
    """
    preset_names = None  # filled in below, after the timestamp is taken
    report = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "generator": "run_allegro_benchmarks.py",
        "vps": {
            "host": "Allegro (167.99.126.228)",
            "cpu_cores": 2,
            "ram_gb": 8,
        },
        "presets": preset_names,
        "results": suite_results,
    }
    report["presets"] = [entry["name"] for entry in load_presets()]

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w") as handle:
        json.dump(report, handle, indent=2)
    print(f"\nJSON report saved: {output_path}")
|
||||
|
||||
|
||||
def generate_markdown_table(suite_results: List[Dict], out_path: Path) -> None:
    """Write a compact Markdown summary table of the benchmark to *out_path*.

    One row is emitted per entry of *suite_results* whose ``preset`` name is
    present in the loaded presets; entries for unknown presets are skipped.
    Parent directories are created when missing. The file is written as UTF-8
    because the report contains non-ASCII characters (emoji and dashes).
    """
    lines = [
        "# Allegro VPS Benchmark Results — TurboQuant Presets",
        "",
        f"*Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
        "",
        "| Preset | Model | KV Type | Est. RAM (GB) | Fits 6GB? | Runs? | Avg tok/s |",
        "|--------|-------|---------|---------------|-----------|-------|-----------|",
    ]

    presets_map = {p["name"]: p for p in load_presets()}

    for r in suite_results:
        p = presets_map.get(r["preset"])
        if p is None:
            continue
        fits_emoji = "✅" if p.get("fits_6gb_budget") else "❌"
        s = r["summary"]
        # A preset "runs" only when every prompt succeeded; otherwise show the failure ratio.
        if s["success"] == s["total"]:
            runs_emoji = "✅"
        else:
            runs_emoji = f"❌ {s['failed']}/{s['total']}"
        lines.append(
            f"| {p['name']} | {p['model']} | {p['kv_type']} | "
            f"{p['estimated_ram_gb']} | {fits_emoji} | {runs_emoji} | "
            f"{s['avg_tok_per_sec']} |"
        )

    lines.extend([
        "",
        "**Hardware:** Allegro VPS — 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 LTS",
        "**Server:** llama-server with TurboQuant Metal/CUDA build on CPU backend",
        "**Prompts:** `benchmarks/prompts.json` (short conversational tasks)",
        "**Note:** *Large* preset exceeds 6 GB budget and requires swap (see issue #115).",
    ])

    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: the locale default (e.g. cp1252) cannot encode the emoji above.
    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"Markdown table saved: {out_path}")
|
||||
|
||||
|
||||
# ─── Main ─────────────────────────────────────────────────────────────────────
|
||||
def main() -> None:
    """CLI entry point for the Allegro benchmark runner.

    Parses arguments, loads the prompt list, then either validates the preset
    file (``--dry-run``) or benchmarks the selected presets and writes a JSON
    report (plus a Markdown table with ``--markdown``).

    Exits with status 1 when the prompts file is missing or yields no prompts,
    or when ``--preset`` names an unknown preset; exits with 0 after a dry run.
    """
    parser = argparse.ArgumentParser(
        description="Allegro VPS benchmark runner — test TurboQuant presets"
    )
    parser.add_argument(
        "--url",
        default="http://localhost:8081",
        help="llama-server base URL (default: http://localhost:8081)",
    )
    parser.add_argument(
        "--prompts",
        default=str(PROMPTS_PATH),
        help="Path to prompts.json (default: benchmarks/prompts.json)",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="JSON output path (default: benchmarks/results/allegro_<ts>.json)",
    )
    parser.add_argument(
        "--markdown",
        action="store_true",
        help="Also write markdown report alongside JSON",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Validate configuration (load presets, check files) without running",
    )
    # --all and --preset cannot be combined.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--all",
        action="store_true",
        help="Run all presets from allegro-cpu-presets.yaml",
    )
    mode_group.add_argument(
        "--preset",
        default=None,
        help="Run only the named preset (e.g. 'medium')",
    )

    args = parser.parse_args()

    # Ensure prompts file exists
    if not Path(args.prompts).exists():
        print(f"ERROR: Prompts file not found: {args.prompts}", file=sys.stderr)
        sys.exit(1)

    with open(args.prompts) as f:
        prompts_data = json.load(f)
    # Entries without a "prompt" key are ignored.
    prompts = [p["prompt"] for p in prompts_data if "prompt" in p]
    if not prompts:
        print("ERROR: No prompts found in prompts file", file=sys.stderr)
        sys.exit(1)

    # Dry-run mode: list the presets and paths, never contact the server
    if args.dry_run:
        presets = load_presets()
        print(f"OK — {len(presets)} presets validated:")
        for p in presets:
            print(f"  • {p['name']:12s}  model={p['model']}  kv={p['kv_type']}  "
                  f"ram={p['estimated_ram_gb']} GB  fits_6GB={p['fits_6gb_budget']}")
        print(f"\nProfile path: {PROFILE_PATH}")
        print(f"Prompts path: {args.prompts}")
        sys.exit(0)

    # Select presets to run
    if args.preset:
        preset = get_preset_by_name(args.preset)
        if not preset:
            print(f"ERROR: Preset '{args.preset}' not found. Available: "
                  f"{', '.join(p['name'] for p in load_presets())}", file=sys.stderr)
            sys.exit(1)
        presets_to_run = [preset]
    else:  # every preset runs when --preset is not given; --all itself is never read
        presets_to_run = load_presets()

    print(f"\n{'='*60}")
    print(f"Allegro VPS Benchmark — {len(presets_to_run)} preset(s)")
    print(f"Server: {args.url}")
    print(f"Prompts: {len(prompts)} from {args.prompts}")
    print(f"{'='*60}")

    # Run benchmarks
    suite_results = []
    for preset in presets_to_run:
        result = run_preset_benchmark(preset, args.url, prompts, timeout=120)
        suite_results.append(result)

    # Save outputs
    ts = int(time.time())
    json_out = Path(args.output) if args.output else RESULTS_DIR / f"allegro_{ts}.json"
    save_json_report(suite_results, json_out)

    if args.markdown:
        # Same path as the JSON report, with a .md suffix.
        md_out = json_out.with_suffix(".md")
        generate_markdown_table(suite_results, md_out)

    print("\nDone.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,75 +0,0 @@
|
||||
# Allegro VPS TurboQuant Preset Configurations
|
||||
# Issue: #95 — Benchmark TurboQuant presets on Allegro VPS (2 cores, 8 GB RAM)
|
||||
#
|
||||
# Hardware: 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 (VPS)
|
||||
# Memory budget: ~6 GB usable for model + KV cache after OS/services overhead
|
||||
#
|
||||
# Usage:
|
||||
# python3 benchmarks/run_allegro_benchmarks.py --all --markdown
|
||||
# python3 benchmarks/run_allegro_benchmarks.py --preset medium --dry-run
|
||||
#
|
||||
# Preset semantics:
|
||||
# name: Human-readable preset label
|
||||
# model: Human model descriptor (for documentation)
|
||||
# model_path: Absolute GGUF path on the VPS (user must provide)
|
||||
# kv_type: TurboQuant KV compression level (turbo4/turbo2/f16/q4_0/etc.)
|
||||
# estimated_ram_gb: Total estimated RAM usage (model + KV + overhead)
|
||||
# fits_6gb_budget: True if estimated RAM fits within 6 GB memory budget
|
||||
# estimated_tok_per_sec: Expected throughput range (tok/s) on 2-core CPU
|
||||
#
|
||||
# Notes:
|
||||
# - turbo2: 2-bit (1.5 bits/channel), fastest, lower quality
|
||||
# - turbo4: 4-bit (3.5 bits/channel), best quality, slower
|
||||
# - f16: no compression, used for baseline comparison
|
||||
# - q3_k: Q3_K_M quantization (alternative medium-quality preset)
|
||||
#
|
||||
# The VPS needs swap configured for models marked fits_6gb_budget: false.
|
||||
# See issue #115 for Allegro swap configuration.
|
||||
|
||||
# Preset list read by benchmarks/run_allegro_benchmarks.py. Every model_path is a
# placeholder that must be replaced with a real GGUF path before benchmarking.
presets:
  - name: tiny
    model: "2B Q4 (Q4_K_M)"
    model_path: "/path/to/2b-q4_k_m.gguf"   # USER: replace with actual path
    kv_type: "f16"
    estimated_ram_gb: 2.8
    fits_6gb_budget: true
    estimated_tok_per_sec: "8-15"
    description: "Baseline: tiny model, no KV compression"

  - name: small
    model: "3B Q4 (Q4_K_M)"
    model_path: "/path/to/3b-q4_k_m.gguf"
    kv_type: "turbo2"
    estimated_ram_gb: 3.6
    fits_6gb_budget: true
    estimated_tok_per_sec: "5-10"
    description: "Best throughput; 2-bit KV compression"

  - name: medium
    model: "7B Q4 (Q4_K_M)"
    model_path: "/path/to/7b-q4_k_m.gguf"
    kv_type: "turbo4"
    estimated_ram_gb: 5.2
    fits_6gb_budget: true
    estimated_tok_per_sec: "2-5"
    description: "Recommended: best quality within 6 GB budget"

  # NOTE(review): model and model_path are the same Q4_K_M file as `medium`,
  # while the comment and description mention q3_k model quantization — confirm
  # which model file and kv_type this preset is meant to use.
  - name: medium-long
    model: "7B Q4 (Q4_K_M)"
    model_path: "/path/to/7b-q4_k_m.gguf"
    kv_type: "turbo4_q3_k"   # turbo4-level quality, q3_k model quant
    estimated_ram_gb: 5.8
    fits_6gb_budget: true
    estimated_tok_per_sec: "1.5-4"
    description: "Extended context, 7B with better model quantization"

  - name: large
    model: "14B Q3 (Q3_K_M)"
    model_path: "/path/to/14b-q3_k_m.gguf"
    kv_type: "turbo4"
    estimated_ram_gb: 7.2
    fits_6gb_budget: false
    estimated_tok_per_sec: "0.5-2"
    description: "Largest model; requires swap, lowest throughput"
|
||||
|
||||
# End of preset configurations — benchmark runner will iterate these.
|
||||
@@ -1,211 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Smoke tests for Allegro VPS benchmark infrastructure — Issue #95
|
||||
|
||||
Validates the preset configuration and runner entry points without
|
||||
actually contacting a llama-server (no network needed).
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
# Put the repository root (two levels up from this file) at the front of
# sys.path so repo modules can be imported by the tests.
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT))


# ─── Test fixtures ────────────────────────────────────────────────────────────
# Locations of the artifacts under test, expressed relative to the repo root.
PROFILE_PATH = REPO_ROOT.joinpath("profiles", "allegro-cpu-presets.yaml")
BENCHMARK_RUNNER = REPO_ROOT.joinpath("benchmarks", "run_allegro_benchmarks.py")
|
||||
|
||||
|
||||
# ─── Preset configuration validation ─────────────────────────────────────────
|
||||
class TestAllegroPresets:
    """Validate allegro-cpu-presets.yaml structure and values."""

    @staticmethod
    def _load_profile():
        """Parse PROFILE_PATH with ``yaml.safe_load`` and return the document.

        ``yaml`` is imported here rather than at module level so that merely
        importing this test module does not require PyYAML. The file is read
        as UTF-8 explicitly so the result does not depend on the platform's
        default encoding.
        """
        import yaml
        with open(PROFILE_PATH, encoding="utf-8") as f:
            return yaml.safe_load(f)

    @classmethod
    def _presets(cls):
        """Return the ``presets`` entry of the parsed profile."""
        return cls._load_profile()["presets"]

    def test_profile_file_exists(self):
        """The profile file must be present at PROFILE_PATH."""
        assert PROFILE_PATH.exists(), f"Profile not found: {PROFILE_PATH}"

    def test_profile_loads_as_yaml(self):
        """The profile parses and exposes a non-empty ``presets`` list."""
        data = self._load_profile()
        # An empty or scalar document would otherwise surface as a TypeError
        # on the membership test below instead of a readable failure.
        assert isinstance(data, dict), "Profile must be a YAML mapping"
        assert "presets" in data, "Profile must have a 'presets' key"
        assert isinstance(data["presets"], list), "presets must be a list"
        assert len(data["presets"]) > 0, "presets list cannot be empty"

    def test_each_preset_has_required_fields(self):
        """Every preset defines the full set of required keys."""
        required = {"name", "model", "model_path", "kv_type",
                    "estimated_ram_gb", "fits_6gb_budget",
                    "estimated_tok_per_sec", "description"}

        for p in self._presets():
            missing = required - set(p.keys())
            assert not missing, f"Preset '{p.get('name','?')}' missing fields: {missing}"

    def test_ram_estimates_are_positive(self):
        """``estimated_ram_gb`` must be strictly greater than zero."""
        for p in self._presets():
            ram = p["estimated_ram_gb"]
            assert ram > 0, f"{p['name']}: estimated_ram_gb must be positive"

    def test_ram_estimates_reasonable_for_8gb_vps(self):
        """No preset's RAM estimate may reach 10 GB.

        10 GB is the ceiling this test tolerates for the 8 GB VPS once swap
        is taken into account; it is deliberately above the physical 8 GB.
        """
        for p in self._presets():
            ram = p["estimated_ram_gb"]
            assert ram < 10, (
                f"{p['name']}: estimated_ram_gb={ram} GB seems too high "
                f"for an 8 GB VPS even with swap"
            )

    def test_kv_type_is_string(self):
        """``kv_type`` is a non-empty string on every preset."""
        for p in self._presets():
            assert isinstance(p["kv_type"], str)
            assert len(p["kv_type"]) > 0

    def test_fits_6gb_budget_is_boolean(self):
        """``fits_6gb_budget`` is a real boolean, not a truthy string or int."""
        for p in self._presets():
            assert isinstance(p["fits_6gb_budget"], bool)

    def test_preset_names_are_unique(self):
        """No two presets share the same ``name``."""
        names = [p["name"] for p in self._presets()]
        assert len(names) == len(set(names)), "Duplicate preset names found"

    def test_expected_preset_names_present(self):
        """Sanity check: the documented 5 presets should exist."""
        names = {p["name"] for p in self._presets()}
        expected = {"tiny", "small", "medium", "medium-long", "large"}
        assert expected.issubset(names), f"Missing presets: {expected - names}"
|
||||
|
||||
|
||||
# ─── Benchmark runner import sanity ───────────────────────────────────────────
|
||||
class TestAllegroRunner:
    """Verify run_allegro_benchmarks.py can be imported and exposes the expected API."""

    def test_runner_file_exists(self):
        """The runner script must be present at BENCHMARK_RUNNER."""
        assert BENCHMARK_RUNNER.exists(), f"Runner not found: {BENCHMARK_RUNNER}"

    def test_runner_is_executable_shebang(self):
        """First line should be a Python shebang."""
        # Read as UTF-8 explicitly so the check is independent of the
        # platform's default text encoding.
        with open(BENCHMARK_RUNNER, encoding="utf-8") as f:
            first = f.readline().strip()
        assert first.startswith("#!"), "Missing shebang"
        assert "python" in first.lower(), "Shebang does not reference python"

    def test_runner_imports_main(self):
        """The runner script should define main() for subprocess invocation."""
        import importlib.util
        # Load the script by file path; it is not part of an importable package.
        spec = importlib.util.spec_from_file_location(
            "run_allegro_benchmarks", BENCHMARK_RUNNER
        )
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)  # type: ignore[attr-defined]
        assert hasattr(mod, "main"), "runner must define a main() function"

    def test_runner_dry_run_invocation(self):
        """Subprocess dry-run should exit 0 and print OK."""
        import subprocess
        env = os.environ.copy()
        # Ensure we use the same python as the test runner
        result = subprocess.run(
            [sys.executable, str(BENCHMARK_RUNNER), "--dry-run"],
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        # Single braces: the exit code and stderr must be interpolated into
        # the failure message (doubled braces would print the placeholder
        # text literally).
        assert result.returncode == 0, (
            f"dry-run failed (code {result.returncode})\nSTDERR: {result.stderr}"
        )
        assert "OK" in result.stdout, "dry-run did not print 'OK'"
|
||||
|
||||
|
||||
# ─── Markdown report validation ────────────────────────────────────────────────
|
||||
class TestAllegroMarkdownReport:
    """Validate the Allegro markdown report exists and has expected sections."""

    # Single definition of the report location shared by every test below.
    REPORT_PATH = REPO_ROOT / "benchmarks" / "allegro-2026-04-14.md"

    @classmethod
    def _read_report(cls):
        """Return the report text, decoded as UTF-8 regardless of platform default."""
        return cls.REPORT_PATH.read_text(encoding="utf-8")

    def test_markdown_report_exists(self):
        """The report file must be present under benchmarks/."""
        md_path = self.REPORT_PATH
        assert md_path.exists(), f"Markdown report not found: {md_path}"

    def test_markdown_contains_presets_table(self):
        """The report contains a presets table with 'tiny' and 'medium' rows."""
        content = self._read_report()
        assert "| Preset" in content, "Missing presets table header"
        assert "| tiny" in content, "Missing 'tiny' preset row"
        assert "| medium" in content, "Missing 'medium' preset row"

    def test_markdown_contains_hardware_spec(self):
        """The report states the VPS core count and RAM size."""
        content = self._read_report()
        assert "2 vCPU" in content or "2 cores" in content, "Should mention the Allegro VPS core count"
        assert "8 GB" in content, "Should mention the Allegro VPS RAM"

    def test_markdown_contains_recommendation(self):
        """The report includes some form of preset recommendation."""
        lowered = self._read_report().lower()
        # The case-insensitive "recommend" check already matches "Recommended",
        # so no separate case-sensitive test is needed.
        assert ("recommend" in lowered or
                "best quality" in lowered), "Should include a preset recommendation"
|
||||
|
||||
|
||||
# ─── Integration helpers test ─────────────────────────────────────────────────
|
||||
class TestAllegroHelpers:
    """Lightweight unit tests for helper functions loaded from the runner."""

    @staticmethod
    def _load_runner():
        """Execute the runner script from BENCHMARK_RUNNER and return it as a module."""
        import importlib.util
        # Load by file path; the script is not part of an importable package.
        spec = importlib.util.spec_from_file_location(
            "run_allegro_benchmarks", BENCHMARK_RUNNER
        )
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)  # type: ignore[attr-defined]
        return mod

    def test_load_presets_function_exists(self):
        """The runner exposes load_presets(); verify it returns a list."""
        mod = self._load_runner()
        presets = mod.load_presets()
        assert isinstance(presets, list)
        # Single braces so the actual count is interpolated into the message.
        assert len(presets) >= 5, f"Expected 5 presets, got {len(presets)}"

    def test_get_preset_by_name_roundtrip(self):
        """get_preset_by_name() returns the preset whose name was requested."""
        mod = self._load_runner()
        for expected in ("tiny", "small", "medium"):
            p = mod.get_preset_by_name(expected)
            assert p is not None, f"get_preset_by_name('{expected}') returned None"
            assert p["name"] == expected
|
||||
|
||||
|
||||
# ─── Entry point ───────────────────────────────────────────────────────────────
|
||||
if __name__ == "__main__":
    # Allow running as `python tests/test_allegro_benchmarks.py` for quick smoke.
    # pytest.main() returns the exit code; raise it so a failing run is
    # reported to the shell instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
Reference in New Issue
Block a user