#!/usr/bin/env python3
"""
TurboQuant Benchmarking Suite — Multi-Backend (Issue #29)

Supports Ollama and llama-server backends with KV cache type configuration.
Measures: TTFT, tokens/sec, latency, peak memory.

Usage:
    # Ollama (default)
    python3 benchmarks/run_benchmarks.py --backend ollama --model llama3

    # llama-server with turbo4 KV
    python3 benchmarks/run_benchmarks.py --backend llama-server \
        --url http://localhost:8080 --model qwen3.5 --kv-type turbo4
"""

import argparse
import json
import os
import subprocess
import sys
import time
from datetime import datetime, timezone

import requests


def get_peak_memory_mb() -> float:
    """Return peak RSS of *this* process in MB (Linux), or current RSS (macOS).

    Note: this measures the benchmark harness process, not the backend
    server, so it is only a rough sanity check. On macOS, `ps` reports
    current rather than peak RSS; on Linux, VmHWM is the true high-water mark.
    """
    try:
        if sys.platform == "darwin":
            result = subprocess.run(
                ["ps", "-o", "rss=", "-p", str(os.getpid())],
                capture_output=True, text=True,
            )
            return int(result.stdout.strip()) / 1024  # KB -> MB
        with open(f"/proc/{os.getpid()}/status") as f:
            for line in f:
                if line.startswith("VmHWM:"):
                    return int(line.split()[1]) / 1024  # KB -> MB
    except Exception:
        pass
    return 0.0


def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict:
    """Run a prompt against Ollama's /api/generate endpoint."""
    api_url = f"{url.rstrip('/')}/api/generate"
    start = time.time()
    ttft = None
    tokens_per_sec = 0.0
    try:
        resp = requests.post(api_url, json={
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {"num_predict": 512},
        }, timeout=timeout)
        elapsed = time.time() - start
        resp.raise_for_status()
        data = resp.json()

        response_text = data.get("response", "")
        eval_count = data.get("eval_count", 0)
        eval_duration_ns = data.get("eval_duration", 0)       # generation time, ns
        prompt_eval_ns = data.get("prompt_eval_duration", 0)  # prompt eval time, ns

        if eval_duration_ns > 0:
            tokens_per_sec = eval_count / (eval_duration_ns / 1e9)
        if prompt_eval_ns > 0:
            # With stream=False there is no true first-token timestamp;
            # prompt eval duration is the closest available proxy for TTFT.
            ttft = prompt_eval_ns / 1e9

        return {
            "response": response_text,
            "latency_s": round(elapsed, 3),
            "ttft_s": round(ttft, 3) if ttft is not None else None,
            "tokens_per_sec": round(tokens_per_sec, 2),
            "eval_count": eval_count,
            "status": "success",
        }
    except Exception as e:
        return {"status": "failed", "error": str(e),
                "latency_s": round(time.time() - start, 3)}


def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "f16",
                     timeout: int = 120) -> dict:
    """Run a prompt against llama-server's OpenAI-compatible API."""
    api_url = f"{url.rstrip('/')}/v1/chat/completions"
    start = time.time()
    ttft = None
    tokens_per_sec = 0.0
    try:
        resp = requests.post(api_url, json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 512,
            "stream": False,
        }, timeout=timeout)
        elapsed = time.time() - start
        resp.raise_for_status()
        data = resp.json()

        response_text = (data.get("choices", [{}])[0]
                             .get("message", {})
                             .get("content", ""))
        usage = data.get("usage", {})
        completion_tokens = usage.get("completion_tokens", 0)
        prompt_tokens = usage.get("prompt_tokens", 0)

        # llama.cpp's server returns a non-standard "timings" object in its
        # responses; prefer it when present, otherwise fall back to a rough
        # wall-clock estimate.
        timings = data.get("timings") or {}
        if timings.get("predicted_per_second"):
            tokens_per_sec = timings["predicted_per_second"]
        elif elapsed > 0 and completion_tokens > 0:
            # Subtract a rough allowance for prompt eval time.
            tokens_per_sec = completion_tokens / max(elapsed - 0.1, 0.01)
        if timings.get("prompt_ms"):
            ttft = timings["prompt_ms"] / 1000.0  # prompt eval as TTFT proxy

        return {
            "response": response_text,
            "latency_s": round(elapsed, 3),
            "ttft_s": round(ttft, 3) if ttft is not None else None,
            "tokens_per_sec": round(tokens_per_sec, 2),
            "completion_tokens": completion_tokens,
            "prompt_tokens": prompt_tokens,
            "kv_type": kv_type,
            "status": "success",
        }
    except Exception as e:
        return {"status": "failed", "error": str(e),
                "latency_s": round(time.time() - start, 3)}

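# Expected prompts file shape, inferred from the accesses in
# run_benchmark_suite below: item["prompt"] is required; "id" and "category"
# are optional labels. An illustrative sketch only, not a file shipped with
# this repo:
#
#   [
#     {"id": "short-qa", "category": "qa", "prompt": "Explain KV caching."},
#     {"id": "summarize-1", "category": "summarization", "prompt": "..."}
#   ]
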
def run_benchmark_suite(backend: str, model: str, url: str, kv_type: str,
                        prompts_file: str, output_file: str, timeout: int = 120):
    """Run the full benchmark suite and write results to a JSON file."""
    if not os.path.exists(prompts_file):
        print(f"ERROR: {prompts_file} not found")
        sys.exit(1)

    with open(prompts_file) as f:
        prompts = json.load(f)

    mem_before = get_peak_memory_mb()
    results = []

    print(f"\n{'='*60}")
    print(f"Backend: {backend} | Model: {model} | KV: {kv_type}")
    print(f"URL: {url}")
    print(f"Prompts: {len(prompts)} | Output: {output_file}")
    print(f"{'='*60}\n")

    for item in prompts:
        pid = item.get("id", item.get("category", "unknown"))
        prompt = item["prompt"]
        print(f"[{pid}] Running...", end=" ", flush=True)

        if backend == "llama-server":
            result = run_llama_server(prompt, model, url, kv_type=kv_type,
                                      timeout=timeout)
        else:
            result = run_ollama(prompt, model, url, timeout=timeout)
        result["id"] = pid
        result["prompt_preview"] = prompt[:120]

        status = "✓" if result["status"] == "success" else "✗"
        tps = result.get("tokens_per_sec", 0)
        lat = result.get("latency_s", 0)
        print(f"{status} {tps:.1f} tok/s, {lat:.2f}s")
        results.append(result)

    mem_after = get_peak_memory_mb()
    successes = [r for r in results if r["status"] == "success"]
    n_success = max(len(successes), 1)  # avoid division by zero

    suite = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "backend": backend,
        "model": model,
        "kv_type": kv_type,
        "url": url,
        "prompts_file": prompts_file,
        "memory_mb": round(max(mem_before, mem_after), 1),
        "results": results,
        "summary": {
            "total": len(results),
            "success": len(successes),
            "failed": len(results) - len(successes),
            "avg_tok_per_sec": round(
                sum(r.get("tokens_per_sec", 0) for r in successes) / n_success, 2),
            "avg_latency_s": round(
                sum(r.get("latency_s", 0) for r in successes) / n_success, 3),
        },
    }

    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
    with open(output_file, "w") as f:
        json.dump(suite, f, indent=2)

    s = suite["summary"]
    print(f"\n{'='*60}")
    print(f"RESULTS: {s['success']}/{s['total']} success | "
          f"Avg {s['avg_tok_per_sec']:.1f} tok/s | "
          f"Avg {s['avg_latency_s']:.2f}s latency")
    print(f"{'='*60}")
    print(f"Saved to {output_file}")


def main():
    parser = argparse.ArgumentParser(description="TurboQuant Benchmark Suite")
    parser.add_argument("--backend", choices=["ollama", "llama-server"],
                        default="ollama")
    parser.add_argument("--model", required=True, help="Model name")
    parser.add_argument("--url", default="http://localhost:11434",
                        help="Backend URL")
    parser.add_argument("--kv-type", default="f16",
                        help="KV cache type label recorded in results "
                             "(llama-server only)")
    parser.add_argument("--prompts", default="benchmarks/prompts.json",
                        help="Prompts file")
    parser.add_argument("--output", default=None,
                        help="Output file (auto-generated if omitted)")
    parser.add_argument("--timeout", type=int, default=120,
                        help="Per-prompt timeout (s)")
    args = parser.parse_args()

    if args.output is None:
        ts = int(time.time())
        args.output = f"benchmarks/results_{args.backend}_{args.kv_type}_{ts}.json"

    run_benchmark_suite(args.backend, args.model, args.url, args.kv_type,
                        args.prompts, args.output, args.timeout)


if __name__ == "__main__":
    main()
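
# NOTE: --kv-type is recorded in the output JSON for bookkeeping; it is not
# sent with each request. The KV cache type must be configured when the
# backend is launched. For llama.cpp's llama-server that is typically done
# with the --cache-type-k/--cache-type-v flags; "turbo4" here is assumed to
# be a cache type registered by a TurboQuant-patched build:
#
#   llama-server -m model.gguf --cache-type-k turbo4 --cache-type-v turbo4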