#!/usr/bin/env python3 """ TurboQuant Benchmarking Suite — Multi-Backend (Issue #29, #63) Supports Ollama and llama-server backends with KV cache type configuration. Measures: TTFT, tokens/sec, latency, peak memory. Perplexity (quality) is NOT measured here tokens/sec is a throughput proxy. For actual quality (logprob-based PPL), use --quality flag which delegates to llama-perplexity binary, since Ollama lacks logprob support (issue #63). Usage: # Ollama (efficiency only) python3 benchmarks/run_benchmarks.py --backend ollama --model llama3 # llama-server with turbo4 KV + quality gate in one shot python3 benchmarks/run_benchmarks.py --backend llama-server \ --url http://localhost:11434 --model qwen3.5 --kv-type turbo4 --quality # Quality gate only (separate tool) python3 benchmarks/run_perplexity.py --model ~/models/qwen3.5-27b.gguf \ --llama-cpp ~/turboquant/llama.cpp-fork/build/bin/llama-perplexity \ --corpus corpora/wiki.test.raw --context 2048 """ import argparse import json import os import re import subprocess import sys import time from datetime import datetime, timezone from typing import List, Dict, Optional import requests def get_peak_memory_mb() -> float: """Get peak RSS of current process in MB (macOS/Linux).""" try: if sys.platform == "darwin": result = subprocess.run(["ps", "-o", "rss=", "-p", str(os.getpid())], capture_output=True, text=True) return int(result.stdout.strip()) / 1024 else: with open(f"/proc/{os.getpid()}/status") as f: for line in f: if line.startswith("VmHWM:"): return int(line.split()[1]) / 1024 except Exception: pass return 0.0 def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict: """Run a prompt against Ollama /api/generate.""" api_url = f"{url.rstrip('/')}/api/generate" start = time.time() ttft = None tokens_per_sec = 0.0 try: resp = requests.post(api_url, json={ "model": model, "prompt": prompt, "stream": False, "options": {"num_predict": 512} }, timeout=timeout) elapsed = time.time() - start resp.raise_for_status() data = resp.json() response_text = data.get("response", "") eval_count = data.get("eval_count", 0) eval_duration_ns = data.get("eval_duration", 0) prompt_eval_ns = data.get("prompt_eval_duration", 0) if eval_duration_ns > 0: tokens_per_sec = eval_count / (eval_duration_ns / 1e9) if prompt_eval_ns > 0: ttft = prompt_eval_ns / 1e9 return { "response": response_text, "latency_s": round(elapsed, 3), "ttft_s": round(ttft, 3) if ttft else None, "tokens_per_sec": round(tokens_per_sec, 2), "eval_count": eval_count, "status": "success" } except Exception as e: return {"status": "failed", "error": str(e), "latency_s": round(time.time() - start, 3)} def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "f16", timeout: int = 120) -> dict: """Run a prompt against llama-server OpenAI-compatible API.""" api_url = f"{url.rstrip('/')}/v1/chat/completions" start = time.time() ttft = None tokens_per_sec = 0.0 try: resp = requests.post(api_url, json={ "model": model, "messages": [{"role": "user", "content": prompt}], "max_tokens": 512, "stream": False }, timeout=timeout) elapsed = time.time() - start resp.raise_for_status() data = resp.json() response_text = data.get("choices", [{}])[0].get("message", {}).get("content", "") usage = data.get("usage", {}) completion_tokens = usage.get("completion_tokens", 0) prompt_tokens = usage.get("prompt_tokens", 0) if elapsed > 0 and completion_tokens > 0: tokens_per_sec = completion_tokens / max(elapsed - 0.1, 0.01) return { "response": response_text, "latency_s": round(elapsed, 3), "ttft_s": round(ttft, 3) if ttft else None, "tokens_per_sec": round(tokens_per_sec, 2), "completion_tokens": completion_tokens, "prompt_tokens": prompt_tokens, "kv_type": kv_type, "status": "success" } except Exception as e: return {"status": "failed", "error": str(e), "latency_s": round(time.time() - start, 3)} def run_benchmark_suite(backend: str, model: str, url: str, kv_type: str, prompts_file: str, output_file: str, timeout: int = 120, measure_quality: bool = False, quality_corpus: str = None, llama_cpp_bin: str = None, context: int = 2048, threads: int = 4): """Run the full benchmark suite, optionally measuring perplexity in parallel.""" if not os.path.exists(prompts_file): print(f"ERROR: {prompts_file} not found") sys.exit(1) with open(prompts_file) as f: prompts = json.load(f) run_fn = run_ollama if backend == "ollama" else run_llama_server mem_before = get_peak_memory_mb() results = [] print(f"\n{'='*60}") print(f"Backend: {backend} | Model: {model} | KV: {kv_type}") print(f"URL: {url}") print(f"Prompts: {len(prompts)} | Output: {output_file}") print(f"{'='*60}\n") for item in prompts: pid = item.get("id", item.get("category", "unknown")) prompt = item["prompt"] print(f"[{pid}] Running...", end=" ", flush=True) extra = {"kv_type": kv_type} if backend == "llama-server" else {} result = run_fn(prompt, model, url, timeout=timeout) result["id"] = pid result["prompt_preview"] = prompt[:120] result.update(extra) status = "✓" if result["status"] == "success" else "✗" tps = result.get("tokens_per_sec", 0) lat = result.get("latency_s", 0) print(f"{status} {tps:.1f} tok/s, {lat:.2f}s") results.append(result) mem_after = get_peak_memory_mb() suite = { "timestamp": datetime.now(timezone.utc).isoformat(), "backend": backend, "model": model, "kv_type": kv_type, "url": url, "prompts_file": prompts_file, "memory_mb": round(max(mem_before, mem_after), 1), "results": results, "summary": { "total": len(results), "success": sum(1 for r in results if r["status"] == "success"), "failed": sum(1 for r in results if r["status"] == "failed"), "avg_tok_per_sec": round( sum(r.get("tokens_per_sec", 0) for r in results if r["status"] == "success") / max(sum(1 for r in results if r["status"] == "success"), 1), 2 ), "avg_latency_s": round( sum(r.get("latency_s", 0) for r in results if r["status"] == "success") / max(sum(1 for r in results if r["status"] == "success"), 1), 3 ), } } # Issue #63: Optional quality measurement via llama-perplexity (Ollama lacks logprob) if measure_quality: print("\n" + "="*60) print("Quality measurement requested — invoking llama-perplexity binary...") llama_cpp_bin = llama_cpp_bin or "llama.cpp-fork/build/bin/llama-perplexity" quality_corpus = quality_corpus or "corpora/wiki.test.raw" if not os.path.exists(quality_corpus): print(f"WARNING: quality corpus not found: {quality_corpus}") suite["quality"] = {"perplexity": None, "passed": False, "error": f"Corpus missing: {quality_corpus}"} elif not os.path.exists(llama_cpp_bin): print(f"WARNING: llama-perplexity binary not found: {llama_cpp_bin}") suite["quality"] = {"perplexity": None, "passed": False, "error": f"Binary missing: {llama_cpp_bin}"} else: cmd = [ llama_cpp_bin, "-m", model, "-f", quality_corpus, "-c", str(context), "-t", str(threads), "--kv-type", kv_type, ] try: start = time.time() result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600) elapsed = time.time() - start output = result.stdout + "\n" + result.stderr ppl_match = re.search(r"perplexity[:\s]+(\d+\.?\d*)", output, re.IGNORECASE) ppl = float(ppl_match.group(1)) if ppl_match else None token_match = re.search(r"(\d+) tokens", output) tokens = int(token_match.group(1)) if token_match else None ppl_result = { "kv_type": kv_type, "perplexity": ppl, "tokens": tokens, "elapsed_seconds": round(elapsed, 1), "exit_code": result.returncode, "passed": result.returncode == 0, "output_tail": output.strip()[-500:] if output else "", } suite["quality"] = ppl_result if ppl is not None: print(f" Perplexity ({kv_type}): {ppl:.4f}") else: print(f" Perplexity: FAILED — could not parse output") except subprocess.TimeoutExpired: suite["quality"] = {"perplexity": None, "passed": False, "error": "Timeout after 3600s"} print(" Perplexity: FAILED — timeout after 3600s") except Exception as e: suite["quality"] = {"perplexity": None, "passed": False, "error": str(e)} print(f" Perplexity: FAILED — {e}") print("="*60) os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True) with open(output_file, "w") as fh: json.dump(suite, fh, indent=2) s = suite["summary"] print(f"\n{'='*60}") print(f"RESULTS: {s['success']}/{s['total']} success | " f"Avg {s['avg_tok_per_sec']:.1f} tok/s | " f"Avg {s['avg_latency_s']:.2f}s latency") if "quality" in suite: q = suite["quality"] if q.get("perplexity") is not None: print(f"Quality: PPL = {q['perplexity']:.4f}") else: print(f"Quality: not available — {q.get('error','unknown')}") print(f"{'='*60}") print(f"Saved to {output_file}") def main(): parser = argparse.ArgumentParser(description="TurboQuant Benchmark Suite") parser.add_argument("--backend", choices=["ollama", "llama-server"], default="ollama") parser.add_argument("--model", required=True, help="Model name or path") parser.add_argument("--url", default="http://localhost:11434", help="Backend URL") parser.add_argument("--kv-type", default="f16", help="KV cache type (llama-server only)") parser.add_argument("--prompts", default="benchmarks/prompts.json", help="Prompts file") parser.add_argument("--output", default=None, help="Output file (auto-generated if omitted)") parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout (s)") # Issue #63: Quality measurement (Ollama lacks logprob → use llama-perplexity binary) parser.add_argument("--quality", action="store_true", default=False, help="Also run quality measurement via llama-perplexity binary") parser.add_argument("--llama-cpp", default="llama.cpp-fork/build/bin/llama-perplexity", help="Path to llama-perplexity binary") parser.add_argument("--quality-corpus", default="corpora/wiki.test.raw", help="Test corpus for perplexity measurement") parser.add_argument("--context", type=int, default=2048, help="Context length for quality measurement") parser.add_argument("--threads", type=int, default=4, help="Thread count for quality measurement") args = parser.parse_args() if args.output is None: ts = int(time.time()) args.output = f"benchmarks/results_{args.backend}_{args.kv_type}_{ts}.json" run_benchmark_suite( backend=args.backend, model=args.model, url=args.url, kv_type=args.kv_type, prompts_file=args.prompts, output_file=args.output, timeout=args.timeout, measure_quality=args.quality, quality_corpus=args.quality_corpus, llama_cpp_bin=args.llama_cpp, context=args.context, threads=args.threads, ) if __name__ == "__main__": main()