diff --git a/benchmarks/run_benchmarks.py b/benchmarks/run_benchmarks.py
index 45c82402..11367f5c 100644
--- a/benchmarks/run_benchmarks.py
+++ b/benchmarks/run_benchmarks.py
@@ -1,75 +1,227 @@
+#!/usr/bin/env python3
+"""
+TurboQuant Benchmarking Suite — Multi-Backend (Issue #29)
+
+Supports Ollama and llama-server backends with KV cache type configuration.
+Measures: TTFT, tokens/sec, latency, peak memory.
+
+Usage:
+    # Ollama (default)
+    python3 benchmarks/run_benchmarks.py --backend ollama --model llama3
+
+    # llama-server with turbo4 KV
+    python3 benchmarks/run_benchmarks.py --backend llama-server \
+        --url http://localhost:11434 --model qwen3.5 --kv-type turbo4
+"""
+
+import argparse
 import json
-import time
-import requests
 import os
-from typing import List, Dict
+import re
+import subprocess
+import sys
+import time
+from datetime import datetime, timezone
+from typing import List, Dict, Optional
 
-# ═══════════════════════════════════════════
-# TURBOQUANT BENCHMARKING SUITE (Issue #16)
-# ═══════════════════════════════════════════
-# This script runs a standardized set of prompts against the local inference
-# engine (Ollama) and logs the results. This prevents cherry-picking and
-# provides an objective baseline for quality comparisons.
+import requests
 
-OLLAMA_URL = "http://localhost:11434/api/generate"
-PROMPTS_FILE = "benchmarks/prompts.json"
-RESULTS_FILE = f"benchmarks/results_{int(time.time())}.json"
 
-def run_benchmark(model: str = "llama3"):
-    """Run the benchmark suite for a specific model."""
-    if not os.path.exists(PROMPTS_FILE):
-        print(f"Error: {PROMPTS_FILE} not found.")
-        return
+def get_peak_memory_mb() -> float:
+    """Get peak RSS of current process in MB (macOS/Linux)."""
+    try:
+        if sys.platform == "darwin":
+            result = subprocess.run(["ps", "-o", "rss=", "-p", str(os.getpid())],
+                                    capture_output=True, text=True)
+            return int(result.stdout.strip()) / 1024
+        else:
+            with open(f"/proc/{os.getpid()}/status") as f:
+                for line in f:
+                    if line.startswith("VmHWM:"):
+                        return int(line.split()[1]) / 1024
+    except Exception:
+        pass
+    return 0.0
 
-    with open(PROMPTS_FILE, 'r') as f:
+
+def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict:
+    """Run a prompt against Ollama /api/generate."""
+    api_url = f"{url.rstrip('/')}/api/generate"
+    start = time.time()
+    ttft = None
+    tokens_per_sec = 0.0
+
+    try:
+        resp = requests.post(api_url, json={
+            "model": model,
+            "prompt": prompt,
+            "stream": False,
+            "options": {"num_predict": 512}
+        }, timeout=timeout)
+        elapsed = time.time() - start
+        resp.raise_for_status()
+        data = resp.json()
+
+        response_text = data.get("response", "")
+        eval_count = data.get("eval_count", 0)
+        eval_duration_ns = data.get("eval_duration", 0)
+        prompt_eval_ns = data.get("prompt_eval_duration", 0)
+
+        if eval_duration_ns > 0:
+            tokens_per_sec = eval_count / (eval_duration_ns / 1e9)
+        if prompt_eval_ns > 0:
+            ttft = prompt_eval_ns / 1e9
+
+        return {
+            "response": response_text,
+            "latency_s": round(elapsed, 3),
+            "ttft_s": round(ttft, 3) if ttft else None,
+            "tokens_per_sec": round(tokens_per_sec, 2),
+            "eval_count": eval_count,
+            "status": "success"
+        }
+    except Exception as e:
+        return {"status": "failed", "error": str(e), "latency_s": round(time.time() - start, 3)}
+
+
+def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "f16",
+                     timeout: int = 120) -> dict:
+    """Run a prompt against llama-server OpenAI-compatible API."""
+    api_url = f"{url.rstrip('/')}/v1/chat/completions"
+    start = time.time()
+    ttft = None
+    tokens_per_sec = 0.0
+
+    try:
+        resp = requests.post(api_url, json={
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": 512,
+            "stream": False
+        }, timeout=timeout)
+        elapsed = time.time() - start
+        resp.raise_for_status()
+        data = resp.json()
+
+        response_text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+        usage = data.get("usage", {})
+        completion_tokens = usage.get("completion_tokens", 0)
+        prompt_tokens = usage.get("prompt_tokens", 0)
+
+        # llama-server includes timing in x_* headers or we estimate
+        if elapsed > 0 and completion_tokens > 0:
+            # Subtract estimated prompt eval time (rough)
+            tokens_per_sec = completion_tokens / max(elapsed - 0.1, 0.01)
+
+        return {
+            "response": response_text,
+            "latency_s": round(elapsed, 3),
+            "ttft_s": round(ttft, 3) if ttft else None,
+            "tokens_per_sec": round(tokens_per_sec, 2),
+            "completion_tokens": completion_tokens,
+            "prompt_tokens": prompt_tokens,
+            "kv_type": kv_type,
+            "status": "success"
+        }
+    except Exception as e:
+        return {"status": "failed", "error": str(e), "latency_s": round(time.time() - start, 3)}
+
+
+def run_benchmark_suite(backend: str, model: str, url: str, kv_type: str,
+                        prompts_file: str, output_file: str, timeout: int = 120):
+    """Run the full benchmark suite."""
+    if not os.path.exists(prompts_file):
+        print(f"ERROR: {prompts_file} not found")
+        sys.exit(1)
+
+    with open(prompts_file) as f:
         prompts = json.load(f)
 
+    run_fn = run_ollama if backend == "ollama" else run_llama_server
+    mem_before = get_peak_memory_mb()
+
     results = []
-    print(f"Starting benchmark for model: {model}")
-    print(f"Saving results to: {RESULTS_FILE}")
+    print(f"\n{'='*60}")
+    print(f"Backend: {backend} | Model: {model} | KV: {kv_type}")
+    print(f"URL: {url}")
+    print(f"Prompts: {len(prompts)} | Output: {output_file}")
+    print(f"{'='*60}\n")
 
     for item in prompts:
-        print(f"Running prompt: {item['id']}...")
-
-        start_time = time.time()
-        try:
-            response = requests.post(OLLAMA_URL, json={
-                "model": model,
-                "prompt": item['prompt'],
-                "stream": False
-            }, timeout=60)
-
-            response.raise_for_status()
-            data = response.json()
-            end_time = time.time()
-
-            results.append({
-                "id": item['id'],
-                "prompt": item['prompt'],
-                "response": data.get("response"),
-                "latency": end_time - start_time,
-                "tokens_per_second": data.get("eval_count", 0) / (data.get("eval_duration", 1) / 1e9) if data.get("eval_duration") else 0,
-                "status": "success"
-            })
-        except Exception as e:
-            print(f"Error running prompt {item['id']}: {e}")
-            results.append({
-                "id": item['id'],
-                "prompt": item['prompt'],
-                "error": str(e),
-                "status": "failed"
-            })
+        pid = item.get("id", item.get("category", "unknown"))
+        prompt = item["prompt"]
+        print(f"[{pid}] Running...", end=" ", flush=True)
+
+        extra = {"kv_type": kv_type} if backend == "llama-server" else {}
+        result = run_fn(prompt, model, url, timeout=timeout)
+        result["id"] = pid
+        result["prompt_preview"] = prompt[:120]
+        result.update(extra)
+
+        status = "✓" if result["status"] == "success" else "✗"
+        tps = result.get("tokens_per_sec", 0)
+        lat = result.get("latency_s", 0)
+        print(f"{status} {tps:.1f} tok/s, {lat:.2f}s")
+
+        results.append(result)
+
+    mem_after = get_peak_memory_mb()
+
+    suite = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "backend": backend,
+        "model": model,
+        "kv_type": kv_type,
+        "url": url,
+        "prompts_file": prompts_file,
+        "memory_mb": round(max(mem_before, mem_after), 1),
+        "results": results,
+        "summary": {
+            "total": len(results),
+            "success": sum(1 for r in results if r["status"] == "success"),
+            "failed": sum(1 for r in results if r["status"] == "failed"),
+            "avg_tok_per_sec": round(
+                sum(r.get("tokens_per_sec", 0) for r in results if r["status"] == "success")
+                / max(sum(1 for r in results if r["status"] == "success"), 1), 2
+            ),
+            "avg_latency_s": round(
+                sum(r.get("latency_s", 0) for r in results if r["status"] == "success")
+                / max(sum(1 for r in results if r["status"] == "success"), 1), 3
+            ),
+        }
+    }
+
+    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
+    with open(output_file, "w") as f:
+        json.dump(suite, f, indent=2)
+
+    s = suite["summary"]
+    print(f"\n{'='*60}")
+    print(f"RESULTS: {s['success']}/{s['total']} success | "
+          f"Avg {s['avg_tok_per_sec']:.1f} tok/s | "
+          f"Avg {s['avg_latency_s']:.2f}s latency")
+    print(f"{'='*60}")
+    print(f"Saved to {output_file}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="TurboQuant Benchmark Suite")
+    parser.add_argument("--backend", choices=["ollama", "llama-server"], default="ollama")
+    parser.add_argument("--model", required=True, help="Model name")
+    parser.add_argument("--url", default="http://localhost:11434", help="Backend URL")
+    parser.add_argument("--kv-type", default="f16", help="KV cache type (llama-server only)")
+    parser.add_argument("--prompts", default="benchmarks/prompts.json", help="Prompts file")
+    parser.add_argument("--output", default=None, help="Output file (auto-generated if omitted)")
+    parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout (s)")
+    args = parser.parse_args()
+
+    if args.output is None:
+        ts = int(time.time())
+        args.output = f"benchmarks/results_{args.backend}_{args.kv_type}_{ts}.json"
+
+    run_benchmark_suite(args.backend, args.model, args.url, args.kv_type,
+                        args.prompts, args.output, args.timeout)
 
-    # Save results
-    with open(RESULTS_FILE, 'w') as f:
-        json.dump({
-            "model": model,
-            "timestamp": time.time(),
-            "results": results
-        }, f, indent=2)
-
-    print("Benchmark complete.")
 
 if __name__ == "__main__":
-    # Default to llama3 for testing
-    run_benchmark("llama3")
+    main()