#!/usr/bin/env python3
"""
TurboQuant Benchmark Comparison (Issue #29).

Runs multiple inference configurations and produces a side-by-side comparison
table with TTFT, tokens/sec, and peak memory.

Configurations (default):
  1. Ollama gemma4 (baseline)
  2. llama-server gemma4 f16 KV
  3. llama-server gemma4 turbo4 KV
  4. llama-server gemma4 turbo4 + layer-adaptive

Usage:
    python3 benchmarks/compare_configs.py --help
    python3 benchmarks/compare_configs.py --config benchmarks/configs.json
    python3 benchmarks/compare_configs.py --demo
"""

import argparse
import json
import os
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Ensure we can import the sibling run_benchmarks module
sys.path.insert(0, str(Path(__file__).resolve().parent))

try:
    from run_benchmarks import (
        run_ollama,
        run_llama_server,
        get_peak_memory_mb,
    )
except ImportError:
    # Fallback stubs when run_benchmarks (and requests) are unavailable
    def run_ollama(prompt, model, url, timeout=120):  # type: ignore
        return {"status": "skipped", "error": "run_benchmarks not available", "latency_s": 0}

    def run_llama_server(prompt, model, url, kv_type="f16", timeout=120):  # type: ignore
        return {"status": "skipped", "error": "run_benchmarks not available", "latency_s": 0}

    def get_peak_memory_mb():  # type: ignore
        return 0.0


# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------

@dataclass
class ConfigEntry:
    """One inference configuration to benchmark."""
    name: str
    backend: str  # "ollama" | "llama-server"
    model: str
    url: str
    kv_type: str = "f16"
    layer_adaptive: bool = False
    env: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        return asdict(self)


@dataclass
class ConfigResult:
    """Aggregated results for a single configuration."""
    config_name: str
    backend: str
    model: str
    kv_type: str
    total_prompts: int
    success: int
    failed: int
    avg_ttft_s: Optional[float]
    avg_tok_per_sec: float
    avg_latency_s: float
    peak_memory_mb: float
    winner: bool = False

    def to_dict(self) -> dict:
        return asdict(self)


# ---------------------------------------------------------------------------
# Default configurations
# ---------------------------------------------------------------------------

DEFAULT_CONFIGS: list[ConfigEntry] = [
    ConfigEntry(name="ollama-gemma4", backend="ollama", model="gemma4",
                url="http://localhost:11434", kv_type="default"),
    ConfigEntry(name="llama-f16", backend="llama-server", model="gemma4",
                url="http://localhost:8081", kv_type="f16"),
    ConfigEntry(name="llama-turbo4", backend="llama-server", model="gemma4",
                url="http://localhost:8081", kv_type="turbo4"),
    ConfigEntry(name="llama-turbo4-adaptive", backend="llama-server", model="gemma4",
                url="http://localhost:8081", kv_type="turbo4", layer_adaptive=True),
]


# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------

def load_prompts(prompts_file: str) -> list[dict]:
    """Load test prompts from a JSON file."""
    with open(prompts_file) as f:
        return json.load(f)
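

# A minimal sketch of the prompts file that load_prompts() expects. The field
# names below are assumptions inferred from how run_config() reads each item:
# "prompt" is required, while "id" and "category" are optional labels used to
# tag per-prompt results. The real schema lives in benchmarks/test_prompts.json.
#
#   [
#     {"id": "qa-short", "category": "qa", "prompt": "Explain KV-cache quantization."},
#     {"id": "sum-long", "category": "summarization", "prompt": "Summarize the following ..."}
#   ]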


def run_config(config: ConfigEntry, prompts: list[dict], timeout: int = 120) -> list[dict]:
    """Run all prompts against a single configuration and return per-prompt results."""
    results = []
    # NOTE: config.env and TURBO_LAYER_ADAPTIVE are collected here, but the
    # helpers below talk to servers that are already running, so these
    # overrides only take effect for backends launched with this environment.
    env_overrides = {**os.environ, **config.env}
    if config.layer_adaptive:
        env_overrides.setdefault("TURBO_LAYER_ADAPTIVE", "7")
    for item in prompts:
        if config.backend == "ollama":
            result = run_ollama(item["prompt"], config.model, config.url, timeout)
        else:
            result = run_llama_server(item["prompt"], config.model, config.url,
                                      kv_type=config.kv_type, timeout=timeout)
        result["id"] = item.get("id", item.get("category", "unknown"))
        result["prompt_preview"] = item["prompt"][:120]
        results.append(result)
    return results


def aggregate(results: list[dict], config: ConfigEntry, peak_mb: float) -> ConfigResult:
    """Aggregate per-prompt results into a ConfigResult."""
    successes = [r for r in results if r.get("status") == "success"]
    ttfts = [r["ttft_s"] for r in successes if r.get("ttft_s") is not None]
    tps = [r["tokens_per_sec"] for r in successes if r.get("tokens_per_sec")]
    lats = [r["latency_s"] for r in successes]
    return ConfigResult(
        config_name=config.name,
        backend=config.backend,
        model=config.model,
        kv_type=config.kv_type,
        total_prompts=len(results),
        success=len(successes),
        failed=len(results) - len(successes),
        avg_ttft_s=round(sum(ttfts) / len(ttfts), 3) if ttfts else None,
        avg_tok_per_sec=round(sum(tps) / len(tps), 2) if tps else 0.0,
        avg_latency_s=round(sum(lats) / len(lats), 3) if lats else 0.0,
        peak_memory_mb=peak_mb,
    )


def build_comparison_table(aggregated: list[ConfigResult]) -> str:
    """Build a human-readable comparison table."""
    lines = []
    header = f"{'Config':<28} {'TTFT(s)':<8} {'tok/s':<10} {'lat(s)':<8} {'mem(MB)':<9} {'ok/n':<6}"
    lines.append(header)
    lines.append("-" * len(header))
    for r in aggregated:
        marker = " <- WINNER" if r.winner else ""
        ttft = f"{r.avg_ttft_s:.3f}" if r.avg_ttft_s is not None else "N/A"
        lines.append(
            f"{r.config_name:<28} {ttft:<8} {r.avg_tok_per_sec:<10.2f} "
            f"{r.avg_latency_s:<8.3f} {r.peak_memory_mb:<9.1f} "
            f"{r.success}/{r.total_prompts}{marker}"
        )
    return "\n".join(lines)


def pick_winner(aggregated: list[ConfigResult]) -> ConfigResult:
    """Choose the winner: highest tokens/sec among configs with at least one success."""
    candidates = [r for r in aggregated if r.success > 0]
    if not candidates:
        return aggregated[0] if aggregated else ConfigResult(
            config_name="none", backend="", model="", kv_type="",
            total_prompts=0, success=0, failed=0, avg_ttft_s=None,
            avg_tok_per_sec=0.0, avg_latency_s=0.0, peak_memory_mb=0.0,
        )
    winner = max(candidates, key=lambda r: r.avg_tok_per_sec)
    winner.winner = True
    return winner


def run_comparison(configs: list[ConfigEntry], prompts: list[dict],
                   output_file: Optional[str] = None, timeout: int = 120) -> dict:
    """Run the full comparison and return a structured report."""
    all_results: list[ConfigResult] = []
    for cfg in configs:
        print(f"\n--- {cfg.name} ({cfg.backend}/{cfg.kv_type}) ---")
        per_prompt = run_config(cfg, prompts, timeout)
        peak_mb = get_peak_memory_mb()
        agg = aggregate(per_prompt, cfg, peak_mb)
        all_results.append(agg)

    winner = pick_winner(all_results)
    table = build_comparison_table(all_results)

    report = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "prompts_count": len(prompts),
        "winner": winner.config_name,
        "winner_tok_per_sec": winner.avg_tok_per_sec,
        "configs": [r.to_dict() for r in all_results],
        "table": table,
    }

    print(f"\n{table}")
    print(f"\nWinner: {winner.config_name} ({winner.avg_tok_per_sec:.2f} tok/s)")

    if output_file:
        os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
        with open(output_file, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Report saved to {output_file}")

    return report
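

# Example (illustrative only): driving the comparison from another script rather
# than the CLI. The endpoint URL and output path are hypothetical; this assumes
# the relevant server is already running and that this file is importable as
# compare_configs (e.g. when the benchmarks/ directory is on sys.path).
#
#   from compare_configs import ConfigEntry, load_prompts, run_comparison
#
#   cfgs = [ConfigEntry(name="local-turbo4", backend="llama-server", model="gemma4",
#                       url="http://localhost:8081", kv_type="turbo4")]
#   prompts = load_prompts("benchmarks/test_prompts.json")
#   run_comparison(cfgs, prompts, output_file="benchmarks/results/compare.json")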


# ---------------------------------------------------------------------------
# Demo mode (no live servers required)
# ---------------------------------------------------------------------------

def run_demo(output_file: Optional[str] = None) -> dict:
    """Generate synthetic benchmark results for testing."""
    import random
    random.seed(42)

    # Simulated performance baselines
    baselines = {
        "ollama-gemma4": {"ttft": 0.85, "tps": 18.2, "mem": 2200},
        "llama-f16": {"ttft": 0.72, "tps": 22.1, "mem": 2400},
        "llama-turbo4": {"ttft": 0.68, "tps": 19.8, "mem": 850},
        "llama-turbo4-adaptive": {"ttft": 0.65, "tps": 20.5, "mem": 820},
    }

    all_results: list[ConfigResult] = []
    for cfg in DEFAULT_CONFIGS:
        bl = baselines[cfg.name]
        prompt_count = 10
        ttft = bl["ttft"] + random.gauss(0, 0.02)
        tps = bl["tps"] + random.gauss(0, 0.5)
        lat = (ttft + 512 / tps) + random.gauss(0, 0.1)
        agg = ConfigResult(
            config_name=cfg.name,
            backend=cfg.backend,
            model=cfg.model,
            kv_type=cfg.kv_type,
            total_prompts=prompt_count,
            success=prompt_count,
            failed=0,
            avg_ttft_s=round(ttft, 3),
            avg_tok_per_sec=round(tps, 2),
            avg_latency_s=round(lat, 3),
            peak_memory_mb=bl["mem"] + random.gauss(0, 50),
        )
        all_results.append(agg)

    winner = pick_winner(all_results)
    table = build_comparison_table(all_results)

    report = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "prompts_count": 10,
        "mode": "demo",
        "winner": winner.config_name,
        "winner_tok_per_sec": winner.avg_tok_per_sec,
        "configs": [r.to_dict() for r in all_results],
        "table": table,
    }

    print(f"\n{table}")
    print(f"\nWinner: {winner.config_name} ({winner.avg_tok_per_sec:.2f} tok/s)")

    if output_file:
        os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
        with open(output_file, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Report saved to {output_file}")

    return report


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(
        description="TurboQuant multi-config benchmark comparison")
    parser.add_argument("--config", type=str,
                        help="JSON file with custom configurations")
    parser.add_argument("--prompts", type=str, default="benchmarks/test_prompts.json",
                        help="Path to test prompts JSON")
    parser.add_argument("--output", type=str, default=None,
                        help="Output file for JSON report")
    parser.add_argument("--timeout", type=int, default=120,
                        help="Timeout per prompt in seconds")
    parser.add_argument("--demo", action="store_true",
                        help="Run with synthetic data (no servers)")
    args = parser.parse_args()

    if args.demo:
        run_demo(args.output)
        return

    # Load configs
    if args.config:
        with open(args.config) as f:
            raw = json.load(f)
        configs = [ConfigEntry(**c) for c in raw]
    else:
        configs = DEFAULT_CONFIGS

    # Load prompts
    prompts = load_prompts(args.prompts)

    run_comparison(configs, prompts, args.output, args.timeout)


if __name__ == "__main__":
    main()
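

# A sketch of the JSON accepted by --config: a list of objects whose keys match
# ConfigEntry's fields (main() constructs ConfigEntry(**entry), so unknown keys
# would raise a TypeError). Names, ports, and env values below are illustrative
# assumptions, not settings shipped with this repo.
#
#   [
#     {"name": "llama-turbo4-alt", "backend": "llama-server", "model": "gemma4",
#      "url": "http://localhost:8082", "kv_type": "turbo4",
#      "layer_adaptive": true, "env": {"TURBO_LAYER_ADAPTIVE": "7"}}
#   ]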