turboquant/benchmarks/run_benchmarks.py
Rockachopa ccbcc8ab7b
fix(benchmarks): separate quality measurement from efficiency proxy (issue #63)
- Add --quality flag to run_benchmarks.py that delegates to llama-perplexity
- Clarify token/sec is an efficiency metric, not perplexity
- Ollama cannot provide true logprob-based PPL (no logprob API)
- Quality gate now runs llama-perplexity binary directly when requested

Closes #63
2026-04-26 10:55:40 -04:00

#!/usr/bin/env python3
"""
TurboQuant Benchmarking Suite — Multi-Backend (Issues #29, #63)
Supports Ollama and llama-server backends with KV cache type configuration.
Measures: TTFT, tokens/sec, latency, peak memory.
Perplexity (quality) is NOT measured here; tokens/sec is a throughput proxy.
For actual quality (logprob-based PPL), use the --quality flag, which delegates to
the llama-perplexity binary, since Ollama lacks logprob support (issue #63).
Usage:
# Ollama (efficiency only)
python3 benchmarks/run_benchmarks.py --backend ollama --model llama3
# llama-server with turbo4 KV + quality gate in one shot
python3 benchmarks/run_benchmarks.py --backend llama-server \
--url http://localhost:11434 --model qwen3.5 --kv-type turbo4 --quality
# Quality gate only (separate tool)
python3 benchmarks/run_perplexity.py --model ~/models/qwen3.5-27b.gguf \
--llama-cpp ~/turboquant/llama.cpp-fork/build/bin/llama-perplexity \
--corpus corpora/wiki.test.raw --context 2048
"""
import argparse
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from typing import Optional
import requests
def get_peak_memory_mb() -> float:
"""Get peak RSS of current process in MB (macOS/Linux)."""
try:
if sys.platform == "darwin":
result = subprocess.run(["ps", "-o", "rss=", "-p", str(os.getpid())],
capture_output=True, text=True)
return int(result.stdout.strip()) / 1024
else:
with open(f"/proc/{os.getpid()}/status") as f:
for line in f:
if line.startswith("VmHWM:"):
return int(line.split()[1]) / 1024
except Exception:
pass
return 0.0
def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict:
"""Run a prompt against Ollama /api/generate."""
api_url = f"{url.rstrip('/')}/api/generate"
start = time.time()
ttft = None
tokens_per_sec = 0.0
try:
resp = requests.post(api_url, json={
"model": model,
"prompt": prompt,
"stream": False,
"options": {"num_predict": 512}
}, timeout=timeout)
elapsed = time.time() - start
resp.raise_for_status()
data = resp.json()
response_text = data.get("response", "")
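        # Ollama's /api/generate response reports eval_count (generated tokens) plus
        # eval_duration and prompt_eval_duration in nanoseconds; throughput and TTFT
        # are derived from those fields below.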
eval_count = data.get("eval_count", 0)
eval_duration_ns = data.get("eval_duration", 0)
prompt_eval_ns = data.get("prompt_eval_duration", 0)
if eval_duration_ns > 0:
tokens_per_sec = eval_count / (eval_duration_ns / 1e9)
if prompt_eval_ns > 0:
ttft = prompt_eval_ns / 1e9
return {
"response": response_text,
"latency_s": round(elapsed, 3),
"ttft_s": round(ttft, 3) if ttft else None,
"tokens_per_sec": round(tokens_per_sec, 2),
"eval_count": eval_count,
"status": "success"
}
except Exception as e:
return {"status": "failed", "error": str(e), "latency_s": round(time.time() - start, 3)}
def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "f16",
timeout: int = 120) -> dict:
"""Run a prompt against llama-server OpenAI-compatible API."""
api_url = f"{url.rstrip('/')}/v1/chat/completions"
start = time.time()
ttft = None
tokens_per_sec = 0.0
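    # Note: with stream=False there is no per-token timing, so TTFT stays None here;
    # tokens/sec below is approximated from wall-clock time and the usage token counts.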
try:
resp = requests.post(api_url, json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 512,
"stream": False
}, timeout=timeout)
elapsed = time.time() - start
resp.raise_for_status()
data = resp.json()
response_text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
usage = data.get("usage", {})
completion_tokens = usage.get("completion_tokens", 0)
prompt_tokens = usage.get("prompt_tokens", 0)
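        # Rough decode-rate estimate: subtract ~0.1 s of assumed request overhead from
        # the wall-clock latency (a heuristic, not a measured prompt-eval time).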
if elapsed > 0 and completion_tokens > 0:
tokens_per_sec = completion_tokens / max(elapsed - 0.1, 0.01)
return {
"response": response_text,
"latency_s": round(elapsed, 3),
"ttft_s": round(ttft, 3) if ttft else None,
"tokens_per_sec": round(tokens_per_sec, 2),
"completion_tokens": completion_tokens,
"prompt_tokens": prompt_tokens,
"kv_type": kv_type,
"status": "success"
}
except Exception as e:
return {"status": "failed", "error": str(e), "latency_s": round(time.time() - start, 3)}
def run_benchmark_suite(backend: str, model: str, url: str, kv_type: str,
                        prompts_file: str, output_file: str, timeout: int = 120,
                        measure_quality: bool = False, quality_corpus: Optional[str] = None,
                        llama_cpp_bin: Optional[str] = None, context: int = 2048, threads: int = 4):
    """Run the full benchmark suite, optionally measuring perplexity afterwards via llama-perplexity."""
if not os.path.exists(prompts_file):
print(f"ERROR: {prompts_file} not found")
sys.exit(1)
with open(prompts_file) as f:
prompts = json.load(f)
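    # The prompts file is a JSON list; each entry is expected to look roughly like
    # (illustrative example): {"id": "code-1", "category": "code", "prompt": "Write a ..."}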
run_fn = run_ollama if backend == "ollama" else run_llama_server
mem_before = get_peak_memory_mb()
results = []
print(f"\n{'='*60}")
print(f"Backend: {backend} | Model: {model} | KV: {kv_type}")
print(f"URL: {url}")
print(f"Prompts: {len(prompts)} | Output: {output_file}")
print(f"{'='*60}\n")
for item in prompts:
pid = item.get("id", item.get("category", "unknown"))
prompt = item["prompt"]
print(f"[{pid}] Running...", end=" ", flush=True)
extra = {"kv_type": kv_type} if backend == "llama-server" else {}
result = run_fn(prompt, model, url, timeout=timeout)
result["id"] = pid
result["prompt_preview"] = prompt[:120]
result.update(extra)
status = "" if result["status"] == "success" else ""
tps = result.get("tokens_per_sec", 0)
lat = result.get("latency_s", 0)
print(f"{status} {tps:.1f} tok/s, {lat:.2f}s")
results.append(result)
mem_after = get_peak_memory_mb()
suite = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"backend": backend,
"model": model,
"kv_type": kv_type,
"url": url,
"prompts_file": prompts_file,
"memory_mb": round(max(mem_before, mem_after), 1),
"results": results,
"summary": {
"total": len(results),
"success": sum(1 for r in results if r["status"] == "success"),
"failed": sum(1 for r in results if r["status"] == "failed"),
"avg_tok_per_sec": round(
sum(r.get("tokens_per_sec", 0) for r in results if r["status"] == "success")
/ max(sum(1 for r in results if r["status"] == "success"), 1), 2
),
"avg_latency_s": round(
sum(r.get("latency_s", 0) for r in results if r["status"] == "success")
/ max(sum(1 for r in results if r["status"] == "success"), 1), 3
),
}
}
# Issue #63: Optional quality measurement via llama-perplexity (Ollama lacks logprob)
if measure_quality:
print("\n" + "="*60)
print("Quality measurement requested — invoking llama-perplexity binary...")
llama_cpp_bin = llama_cpp_bin or "llama.cpp-fork/build/bin/llama-perplexity"
quality_corpus = quality_corpus or "corpora/wiki.test.raw"
if not os.path.exists(quality_corpus):
print(f"WARNING: quality corpus not found: {quality_corpus}")
suite["quality"] = {"perplexity": None, "passed": False, "error": f"Corpus missing: {quality_corpus}"}
elif not os.path.exists(llama_cpp_bin):
print(f"WARNING: llama-perplexity binary not found: {llama_cpp_bin}")
suite["quality"] = {"perplexity": None, "passed": False, "error": f"Binary missing: {llama_cpp_bin}"}
else:
cmd = [
llama_cpp_bin,
"-m", model,
"-f", quality_corpus,
"-c", str(context),
"-t", str(threads),
"--kv-type", kv_type,
]
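            # --kv-type is assumed here to be a KV-cache flag exposed by the turboquant
            # llama.cpp fork's llama-perplexity build (upstream llama.cpp names this option differently).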
try:
start = time.time()
result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600)
elapsed = time.time() - start
output = result.stdout + "\n" + result.stderr
                # Accept both a "perplexity: X" line and the "PPL = X" form printed by llama.cpp builds
                ppl_match = re.search(r"(?:perplexity|PPL)[\s=:]+(\d+\.?\d*)", output, re.IGNORECASE)
                ppl = float(ppl_match.group(1)) if ppl_match else None
token_match = re.search(r"(\d+) tokens", output)
tokens = int(token_match.group(1)) if token_match else None
ppl_result = {
"kv_type": kv_type,
"perplexity": ppl,
"tokens": tokens,
"elapsed_seconds": round(elapsed, 1),
"exit_code": result.returncode,
"passed": result.returncode == 0,
"output_tail": output.strip()[-500:] if output else "",
}
suite["quality"] = ppl_result
if ppl is not None:
print(f" Perplexity ({kv_type}): {ppl:.4f}")
else:
print(f" Perplexity: FAILED — could not parse output")
except subprocess.TimeoutExpired:
suite["quality"] = {"perplexity": None, "passed": False, "error": "Timeout after 3600s"}
print(" Perplexity: FAILED — timeout after 3600s")
except Exception as e:
suite["quality"] = {"perplexity": None, "passed": False, "error": str(e)}
print(f" Perplexity: FAILED — {e}")
print("="*60)
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
with open(output_file, "w") as fh:
json.dump(suite, fh, indent=2)
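    # The saved JSON contains run metadata (backend, model, kv_type), per-prompt results,
    # the aggregate "summary" block, and a "quality" block when --quality was requested.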
s = suite["summary"]
print(f"\n{'='*60}")
print(f"RESULTS: {s['success']}/{s['total']} success | "
f"Avg {s['avg_tok_per_sec']:.1f} tok/s | "
f"Avg {s['avg_latency_s']:.2f}s latency")
if "quality" in suite:
q = suite["quality"]
if q.get("perplexity") is not None:
print(f"Quality: PPL = {q['perplexity']:.4f}")
else:
print(f"Quality: not available — {q.get('error','unknown')}")
print(f"{'='*60}")
print(f"Saved to {output_file}")
def main():
parser = argparse.ArgumentParser(description="TurboQuant Benchmark Suite")
parser.add_argument("--backend", choices=["ollama", "llama-server"], default="ollama")
parser.add_argument("--model", required=True, help="Model name or path")
parser.add_argument("--url", default="http://localhost:11434", help="Backend URL")
parser.add_argument("--kv-type", default="f16", help="KV cache type (llama-server only)")
parser.add_argument("--prompts", default="benchmarks/prompts.json", help="Prompts file")
parser.add_argument("--output", default=None, help="Output file (auto-generated if omitted)")
parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout (s)")
# Issue #63: Quality measurement (Ollama lacks logprob → use llama-perplexity binary)
parser.add_argument("--quality", action="store_true", default=False,
help="Also run quality measurement via llama-perplexity binary")
parser.add_argument("--llama-cpp", default="llama.cpp-fork/build/bin/llama-perplexity",
help="Path to llama-perplexity binary")
parser.add_argument("--quality-corpus", default="corpora/wiki.test.raw",
help="Test corpus for perplexity measurement")
parser.add_argument("--context", type=int, default=2048,
help="Context length for quality measurement")
parser.add_argument("--threads", type=int, default=4,
help="Thread count for quality measurement")
args = parser.parse_args()
if args.output is None:
ts = int(time.time())
args.output = f"benchmarks/results_{args.backend}_{args.kv_type}_{ts}.json"
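        # e.g. benchmarks/results_ollama_f16_1745678900.json (illustrative timestamp)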
run_benchmark_suite(
backend=args.backend,
model=args.model,
url=args.url,
kv_type=args.kv_type,
prompts_file=args.prompts,
output_file=args.output,
timeout=args.timeout,
measure_quality=args.quality,
quality_corpus=args.quality_corpus,
llama_cpp_bin=args.llama_cpp,
context=args.context,
threads=args.threads,
)
if __name__ == "__main__":
main()