Compare commits

...

1 Commits

Author SHA1 Message Date
Alexander Whitestone
450b40862a feat: multi-backend benchmark suite with TTFT + memory tracking (refs #29)
Some checks failed
Smoke Test / smoke (pull_request) Failing after 5s
2026-04-13 04:15:22 -04:00

View File

@@ -1,75 +1,227 @@
#!/usr/bin/env python3
"""
TurboQuant Benchmarking Suite — Multi-Backend (Issue #29)
Supports Ollama and llama-server backends with KV cache type configuration.
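Measures: TTFT (approximated from prompt-eval time when the backend reports it), tokens/sec, latency, and peak memory of this benchmark client process.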
Usage:
# Ollama (default)
python3 benchmarks/run_benchmarks.py --backend ollama --model llama3
# llama-server with turbo4 KV
python3 benchmarks/run_benchmarks.py --backend llama-server \
--url http://localhost:11434 --model qwen3.5 --kv-type turbo4
"""
import argparse
import json import json
import time
import requests
import os import os
from typing import List, Dict import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from typing import List, Dict, Optional
# ═══════════════════════════════════════════ import requests
# TURBOQUANT BENCHMARKING SUITE (Issue #16)
# ═══════════════════════════════════════════
# This script runs a standardized set of prompts against the local inference
# engine (Ollama) and logs the results. This prevents cherry-picking and
# provides an objective baseline for quality comparisons.
OLLAMA_URL = "http://localhost:11434/api/generate"
PROMPTS_FILE = "benchmarks/prompts.json"
RESULTS_FILE = f"benchmarks/results_{int(time.time())}.json"
def get_peak_memory_mb() -> float:
    """Return the peak resident set size (RSS) of the current process in MB.

    This measures the benchmark client itself (the process identified by
    ``os.getpid()``), not the inference server it talks to.

    Returns:
        Peak RSS in megabytes, or 0.0 when no measurement source is available.
    """
    # Linux: VmHWM in /proc/<pid>/status is the RSS high-water mark, in kB.
    if sys.platform.startswith("linux"):
        try:
            with open(f"/proc/{os.getpid()}/status", encoding="utf-8") as f:
                for line in f:
                    if line.startswith("VmHWM:"):
                        return int(line.split()[1]) / 1024
        except (OSError, ValueError, IndexError):
            pass  # Best effort: fall through to getrusage below.

    # Other Unix-likes (including macOS): ask the kernel for the true peak
    # instead of shelling out to `ps`, which only reports the *current* RSS.
    try:
        import resource  # Unix-only module; absent on Windows.
    except ImportError:
        return 0.0
    try:
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    except (OSError, ValueError):
        return 0.0

    # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
    divisor = 1024 * 1024 if sys.platform == "darwin" else 1024
    return peak / divisor
with open(PROMPTS_FILE, 'r') as f:
def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict:
    """Run one prompt against Ollama's ``/api/generate`` endpoint (non-streaming).

    Args:
        prompt: Prompt text to send.
        model: Model name passed through to the backend.
        url: Base URL of the Ollama server; a trailing slash is tolerated.
        timeout: Request timeout in seconds.

    Returns:
        On success, a dict with ``response``, ``latency_s``, ``ttft_s``,
        ``tokens_per_sec``, ``eval_count`` and ``status == "success"``.
        On any error, a dict with ``status == "failed"``, ``error`` and
        ``latency_s``. This function never raises, so one bad prompt cannot
        abort a whole benchmark run.
    """
    api_url = f"{url.rstrip('/')}/api/generate"
    # perf_counter is monotonic; time.time() can jump if the wall clock is adjusted.
    start = time.perf_counter()
    try:
        resp = requests.post(
            api_url,
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {"num_predict": 512},
            },
            timeout=timeout,
        )
        elapsed = time.perf_counter() - start
        resp.raise_for_status()
        data = resp.json()

        # `or 0` also covers fields that are present but JSON null.
        eval_count = data.get("eval_count") or 0
        eval_duration_ns = data.get("eval_duration") or 0
        prompt_eval_ns = data.get("prompt_eval_duration") or 0

        # Durations are divided by 1e9 to convert nanoseconds to seconds.
        tokens_per_sec = 0.0
        if eval_duration_ns > 0:
            tokens_per_sec = eval_count / (eval_duration_ns / 1e9)

        # With stream=False the first token cannot be observed directly, so
        # prompt-eval time is used as an approximation of TTFT.
        ttft = prompt_eval_ns / 1e9 if prompt_eval_ns > 0 else None

        return {
            "response": data.get("response", ""),
            "latency_s": round(elapsed, 3),
            "ttft_s": round(ttft, 3) if ttft is not None else None,
            "tokens_per_sec": round(tokens_per_sec, 2),
            "eval_count": eval_count,
            "status": "success",
        }
    except Exception as e:  # Deliberate boundary: report the failure, keep the suite running.
        return {
            "status": "failed",
            "error": str(e),
            "latency_s": round(time.perf_counter() - start, 3),
        }
def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "f16",
                     timeout: int = 120) -> dict:
    """Run one prompt against llama-server's OpenAI-compatible chat endpoint.

    Args:
        prompt: User message content to send.
        model: Model name passed through to the backend.
        url: Base URL of the server; a trailing slash is tolerated.
        kv_type: KV cache type label. It is only recorded in the result; this
            function does not send it to the server.
        timeout: Request timeout in seconds.

    Returns:
        On success, a dict with ``response``, ``latency_s``, ``ttft_s``,
        ``tokens_per_sec``, ``completion_tokens``, ``prompt_tokens``,
        ``kv_type`` and ``status == "success"``. On any error, a dict with
        ``status == "failed"``, ``error`` and ``latency_s``. Never raises.
    """
    api_url = f"{url.rstrip('/')}/v1/chat/completions"
    # perf_counter is monotonic; time.time() can jump if the wall clock is adjusted.
    start = time.perf_counter()
    try:
        resp = requests.post(
            api_url,
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 512,
                "stream": False,
            },
            timeout=timeout,
        )
        elapsed = time.perf_counter() - start
        resp.raise_for_status()
        data = resp.json()

        # Tolerate missing, null, or empty fields instead of failing with an
        # opaque IndexError/AttributeError.
        choices = data.get("choices") or [{}]
        message = choices[0].get("message") or {}
        response_text = message.get("content") or ""

        usage = data.get("usage") or {}
        completion_tokens = usage.get("completion_tokens") or 0
        prompt_tokens = usage.get("prompt_tokens") or 0

        # NOTE(review): llama-server is expected to attach a `timings` object
        # (prompt_ms, predicted_per_second, ...) to the response; confirm for
        # the deployed build. When absent, the estimate below is used.
        timings = data.get("timings") or {}
        prompt_ms = timings.get("prompt_ms") or 0
        predicted_per_second = timings.get("predicted_per_second") or 0

        # Prompt-processing time approximates TTFT for a non-streaming request.
        ttft = prompt_ms / 1000 if prompt_ms > 0 else None

        tokens_per_sec = 0.0
        if predicted_per_second > 0:
            tokens_per_sec = float(predicted_per_second)
        elif elapsed > 0 and completion_tokens > 0:
            # Rough fallback: subtract a nominal 0.1 s for prompt evaluation,
            # clamped so the divisor stays positive.
            tokens_per_sec = completion_tokens / max(elapsed - 0.1, 0.01)

        return {
            "response": response_text,
            "latency_s": round(elapsed, 3),
            "ttft_s": round(ttft, 3) if ttft is not None else None,
            "tokens_per_sec": round(tokens_per_sec, 2),
            "completion_tokens": completion_tokens,
            "prompt_tokens": prompt_tokens,
            "kv_type": kv_type,
            "status": "success",
        }
    except Exception as e:  # Deliberate boundary: report the failure, keep the suite running.
        return {
            "status": "failed",
            "error": str(e),
            "latency_s": round(time.perf_counter() - start, 3),
        }
def run_benchmark_suite(backend: str, model: str, url: str, kv_type: str,
                        prompts_file: str, output_file: str, timeout: int = 120):
    """Run every prompt in ``prompts_file`` against one backend and save a report.

    Args:
        backend: ``"ollama"`` selects ``run_ollama``; any other value selects
            ``run_llama_server``.
        model: Model name passed to the backend runner.
        url: Base URL of the backend.
        kv_type: KV cache type label; attached to each result only when
            ``backend == "llama-server"``, and always stored in the report.
        prompts_file: Path to a JSON list of objects with a ``prompt`` key and
            optionally ``id`` or ``category``.
        output_file: Path of the JSON report; parent directories are created.
        timeout: Per-prompt request timeout in seconds.

    Exits the process with status 1 when ``prompts_file`` does not exist.
    """
    if not os.path.exists(prompts_file):
        print(f"ERROR: {prompts_file} not found")
        sys.exit(1)

    with open(prompts_file, encoding="utf-8") as f:
        prompts = json.load(f)

    run_fn = run_ollama if backend == "ollama" else run_llama_server
    # Loop-invariant: only llama-server results are tagged with the KV type.
    extra = {"kv_type": kv_type} if backend == "llama-server" else {}
    mem_before = get_peak_memory_mb()
    results = []

    print(f"\n{'='*60}")
    print(f"Backend: {backend} | Model: {model} | KV: {kv_type}")
    print(f"URL: {url}")
    print(f"Prompts: {len(prompts)} | Output: {output_file}")
    print(f"{'='*60}\n")

    for item in prompts:
        pid = item.get("id", item.get("category", "unknown"))
        prompt = item["prompt"]
        print(f"[{pid}] Running...", end=" ", flush=True)

        result = run_fn(prompt, model, url, timeout=timeout)
        result["id"] = pid
        result["prompt_preview"] = prompt[:120]
        result.update(extra)

        # The marker was an empty string in both branches, which made
        # failures indistinguishable from successes in the console output.
        status = "OK" if result["status"] == "success" else "FAIL"
        tps = result.get("tokens_per_sec", 0)
        lat = result.get("latency_s", 0)
        print(f"{status} {tps:.1f} tok/s, {lat:.2f}s")

        results.append(result)

    mem_after = get_peak_memory_mb()

    # Averages are taken over successful runs only; compute that subset once.
    succeeded = [r for r in results if r["status"] == "success"]
    divisor = max(len(succeeded), 1)  # Avoid dividing by zero when all fail.

    suite = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "backend": backend,
        "model": model,
        "kv_type": kv_type,
        "url": url,
        "prompts_file": prompts_file,
        # Peak memory of this benchmark process, as reported by get_peak_memory_mb().
        "memory_mb": round(max(mem_before, mem_after), 1),
        "results": results,
        "summary": {
            "total": len(results),
            "success": len(succeeded),
            "failed": sum(1 for r in results if r["status"] == "failed"),
            "avg_tok_per_sec": round(
                sum(r.get("tokens_per_sec", 0) for r in succeeded) / divisor, 2
            ),
            "avg_latency_s": round(
                sum(r.get("latency_s", 0) for r in succeeded) / divisor, 3
            ),
        },
    }

    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(suite, f, indent=2)

    s = suite["summary"]
    print(f"\n{'='*60}")
    print(f"RESULTS: {s['success']}/{s['total']} success | "
          f"Avg {s['avg_tok_per_sec']:.1f} tok/s | "
          f"Avg {s['avg_latency_s']:.2f}s latency")
    print(f"{'='*60}")
    print(f"Saved to {output_file}")
def main(argv: Optional[List[str]] = None):
    """Parse command-line options and run the benchmark suite.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers are unaffected.

    When ``--output`` is omitted, a file name is generated from the backend,
    the KV type and the current Unix timestamp.
    """
    parser = argparse.ArgumentParser(description="TurboQuant Benchmark Suite")
    parser.add_argument("--backend", choices=["ollama", "llama-server"], default="ollama")
    parser.add_argument("--model", required=True, help="Model name")
    parser.add_argument("--url", default="http://localhost:11434", help="Backend URL")
    parser.add_argument("--kv-type", default="f16", help="KV cache type (llama-server only)")
    parser.add_argument("--prompts", default="benchmarks/prompts.json", help="Prompts file")
    parser.add_argument("--output", default=None, help="Output file (auto-generated if omitted)")
    parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout (s)")
    args = parser.parse_args(argv)

    if args.output is None:
        ts = int(time.time())
        args.output = f"benchmarks/results_{args.backend}_{args.kv_type}_{ts}.json"

    run_benchmark_suite(args.backend, args.model, args.url, args.kv_type,
                        args.prompts, args.output, args.timeout)
# Save results
with open(RESULTS_FILE, 'w') as f:
json.dump({
"model": model,
"timestamp": time.time(),
"results": results
}, f, indent=2)
print("Benchmark complete.")
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()