# turboquant/benchmarks/compare_configs.py

#!/usr/bin/env python3
"""
TurboQuant Benchmark Comparison (Issue #29).

Runs multiple inference configurations and produces a side-by-side
comparison table with TTFT, tokens/sec, and peak memory.

Configurations (default):
  1. Ollama gemma4 (baseline)
  2. llama-server gemma4 f16 KV
  3. llama-server gemma4 turbo4 KV
  4. llama-server gemma4 turbo4 + layer-adaptive

Usage:
    python3 benchmarks/compare_configs.py --help
    python3 benchmarks/compare_configs.py --config benchmarks/configs.json
    python3 benchmarks/compare_configs.py --demo
"""

import argparse
import json
import os
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Ensure we can import sibling run_benchmarks
# (the directory containing this file is put first on the module search path).
sys.path.insert(0, str(Path(__file__).resolve().parent))

try:
    from run_benchmarks import (
        run_ollama,
        run_llama_server,
        get_peak_memory_mb,
    )
except ImportError:
    # Fallback stubs when run_benchmarks (and requests) are unavailable.
    # They keep this module importable; every prompt is then reported as
    # "skipped" instead of being sent to a backend.
    def run_ollama(prompt, model, url, timeout=120):  # type: ignore
        """Stub runner: ignore the arguments and return a "skipped" result."""
        return {"status": "skipped", "error": "run_benchmarks not available", "latency_s": 0}

    def run_llama_server(prompt, model, url, kv_type="f16", timeout=120):  # type: ignore
        """Stub runner: ignore the arguments and return a "skipped" result."""
        return {"status": "skipped", "error": "run_benchmarks not available", "latency_s": 0}

    def get_peak_memory_mb():  # type: ignore
        """Stub memory probe: always report 0.0 MB."""
        return 0.0


# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------

@dataclass
class ConfigEntry:
    """One inference configuration to benchmark.

    Instances are built either from ``DEFAULT_CONFIGS`` or from the objects in
    a user-supplied JSON file (``ConfigEntry(**obj)`` in ``main``), so the
    field names below are also the accepted JSON keys.
    """
    # Label shown in the progress output and in the comparison table.
    name: str
    # Selects the runner: "ollama" uses run_ollama, any other value is
    # treated as llama-server.
    backend: str  # "ollama" | "llama-server"
    # Model identifier forwarded to the runner.
    model: str
    # Base URL of the server, forwarded to the runner.
    url: str
    # KV-cache type; forwarded to the llama-server runner only.
    kv_type: str = "f16"
    # NOTE(review): nothing visible in this file forwards this flag to a
    # backend — confirm how layer-adaptive mode is actually enabled.
    layer_adaptive: bool = False
    # Extra environment variables for this configuration.
    # NOTE(review): nothing visible in this file passes these to a backend
    # process — confirm where they are meant to be applied.
    env: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the entry as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)


@dataclass
class ConfigResult:
    """Aggregated results for a single configuration.

    Produced by ``aggregate`` (live runs) or ``run_demo`` (synthetic data) and
    serialized into the JSON report through ``to_dict``.
    """
    # Identification, copied from the ConfigEntry that was benchmarked.
    config_name: str
    backend: str
    model: str
    kv_type: str
    # Number of prompts attempted, and how many did / did not report
    # status == "success".
    total_prompts: int
    success: int
    failed: int
    # Mean time-to-first-token in seconds; None when no successful run
    # reported a TTFT (rendered as "N/A" in the table).
    avg_ttft_s: Optional[float]
    # Mean tokens/sec over successful runs (0.0 when none reported it).
    avg_tok_per_sec: float
    # Mean end-to-end latency in seconds over successful runs (0.0 when none).
    avg_latency_s: float
    # Peak memory figure in MB supplied by the caller for this configuration.
    peak_memory_mb: float
    # Set to True by pick_winner on the selected configuration.
    winner: bool = False

    def to_dict(self) -> dict:
        """Return the result as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)


# ---------------------------------------------------------------------------
# Default configurations
# ---------------------------------------------------------------------------

# Configurations benchmarked when no --config file is given.
# run_demo looks up its synthetic baselines by these ``name`` values, so a
# renamed or added entry needs a matching baseline there.
# NOTE(review): the three llama-server entries share one URL; presumably the
# server must be (re)started with the matching KV settings for each — confirm.
DEFAULT_CONFIGS: list[ConfigEntry] = [
    # Baseline. kv_type here is only a label: run_config does not pass
    # kv_type to the Ollama runner.
    ConfigEntry(name="ollama-gemma4", backend="ollama", model="gemma4",
                url="http://localhost:11434", kv_type="default"),
    ConfigEntry(name="llama-f16", backend="llama-server", model="gemma4",
                url="http://localhost:8081", kv_type="f16"),
    ConfigEntry(name="llama-turbo4", backend="llama-server", model="gemma4",
                url="http://localhost:8081", kv_type="turbo4"),
    ConfigEntry(name="llama-turbo4-adaptive", backend="llama-server",
                model="gemma4", url="http://localhost:8081",
                kv_type="turbo4", layer_adaptive=True),
]


# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------

def load_prompts(prompts_file: str) -> list[dict]:
    """Load test prompts from a JSON file.

    Args:
        prompts_file: Path to a JSON document holding the prompt records.

    Returns:
        The decoded JSON content, returned as-is (no schema validation).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; decoding with the platform's locale codec (the
    # default for open()) corrupts or rejects non-ASCII prompts on some systems.
    with open(prompts_file, encoding="utf-8") as f:
        return json.load(f)


def run_config(config: ConfigEntry, prompts: list[dict], timeout: int = 120) -> list[dict]:
    """Run every prompt against one configuration and collect the raw results.

    Args:
        config: Configuration to benchmark. ``backend == "ollama"`` goes
            through ``run_ollama``; every other backend value goes through
            ``run_llama_server`` together with ``config.kv_type``.
        prompts: Prompt records. Each must contain a ``"prompt"`` string and
            may contain ``"id"`` and/or ``"category"``.
        timeout: Per-prompt timeout in seconds, forwarded to the runner.

    Returns:
        One result dict per prompt, in input order: the dict returned by the
        runner with ``"id"`` (prompt id, else category, else ``"unknown"``)
        and ``"prompt_preview"`` (first 120 characters of the prompt) added.

    Raises:
        KeyError: If a prompt record has no ``"prompt"`` key.
    """
    # NOTE(review): ``config.env`` and ``config.layer_adaptive`` are NOT applied
    # here. Earlier code merged them into a local dict (defaulting
    # TURBO_LAYER_ADAPTIVE to "7" for layer-adaptive configs) that was never
    # passed to anything, so it had no effect and has been removed. Presumably
    # those settings belong to the server process's environment — confirm and
    # wire them in where the server is launched.
    use_ollama = config.backend == "ollama"

    results = []
    for item in prompts:
        prompt = item["prompt"]
        if use_ollama:
            result = run_ollama(prompt, config.model, config.url, timeout)
        else:
            result = run_llama_server(prompt, config.model, config.url,
                                      kv_type=config.kv_type, timeout=timeout)
        # Tag the runner's result so it can be traced back to its prompt.
        result["id"] = item.get("id", item.get("category", "unknown"))
        result["prompt_preview"] = prompt[:120]
        results.append(result)
    return results


def aggregate(results: list[dict], config: ConfigEntry, peak_mb: float) -> ConfigResult:
    """Collapse per-prompt result dicts into one ConfigResult.

    Only results whose ``"status"`` equals ``"success"`` contribute to the
    averages. TTFT is averaged over successes that report a non-None
    ``"ttft_s"``, throughput over successes with a truthy
    ``"tokens_per_sec"``, and latency over every success.

    Args:
        results: Per-prompt result dicts as returned by ``run_config``.
        config: The configuration these results belong to.
        peak_mb: Peak memory figure (MB) to record for this configuration.

    Returns:
        The aggregated ConfigResult; averages with no samples fall back to
        ``None`` (TTFT) or ``0.0`` (throughput, latency).
    """
    def mean_or(samples, digits, fallback):
        # Rounded arithmetic mean, or the fallback when there are no samples.
        if not samples:
            return fallback
        return round(sum(samples) / len(samples), digits)

    ok_runs = [entry for entry in results if entry.get("status") == "success"]

    ttft_samples = []
    throughput_samples = []
    latency_samples = []
    for run in ok_runs:
        if run.get("ttft_s") is not None:
            ttft_samples.append(run["ttft_s"])
        if run.get("tokens_per_sec"):
            throughput_samples.append(run["tokens_per_sec"])
        latency_samples.append(run["latency_s"])

    attempted = len(results)
    succeeded = len(ok_runs)

    return ConfigResult(
        config_name=config.name,
        backend=config.backend,
        model=config.model,
        kv_type=config.kv_type,
        total_prompts=attempted,
        success=succeeded,
        failed=attempted - succeeded,
        avg_ttft_s=mean_or(ttft_samples, 3, None),
        avg_tok_per_sec=mean_or(throughput_samples, 2, 0.0),
        avg_latency_s=mean_or(latency_samples, 3, 0.0),
        peak_memory_mb=peak_mb,
    )


def build_comparison_table(aggregated: list[ConfigResult]) -> str:
    """Render the aggregated results as a fixed-width text table.

    Args:
        aggregated: One ConfigResult per configuration, in display order.

    Returns:
        A newline-joined string: header row, dashed rule of the same width,
        then one row per configuration. A missing TTFT is shown as ``N/A``
        and the winning row carries a trailing ``<- WINNER`` marker.
    """
    columns = (
        ("Config", 28),
        ("TTFT", 8),
        ("tok/s", 10),
        ("lat(s)", 8),
        ("mem(MB)", 9),
        ("ok/n", 6),
    )
    title_row = " ".join(f"{label:<{width}}" for label, width in columns)
    rendered = [title_row, "-" * len(title_row)]

    for entry in aggregated:
        if entry.avg_ttft_s is None:
            ttft_cell = "N/A"
        else:
            ttft_cell = f"{entry.avg_ttft_s:.3f}"
        suffix = " <- WINNER" if entry.winner else ""
        cells = (
            f"{entry.config_name:<28}",
            f"{ttft_cell:<8}",
            f"{entry.avg_tok_per_sec:<10.2f}",
            f"{entry.avg_latency_s:<8.3f}",
            f"{entry.peak_memory_mb:<9.1f}",
            # Last column is not padded so the marker follows it directly.
            f"{entry.success}/{entry.total_prompts}{suffix}",
        )
        rendered.append(" ".join(cells))

    return "\n".join(rendered)


def pick_winner(aggregated: list[ConfigResult]) -> ConfigResult:
    """Choose the winner: highest tokens/sec among configs with a success.

    The chosen entry is flagged in place (``winner = True``); every other
    entry's flag is cleared so the table never shows more than one winner,
    even if the list was flagged by an earlier call.

    Args:
        aggregated: Results to choose from; entries are mutated in place.

    Returns:
        The entry with the highest ``avg_tok_per_sec`` among those with
        ``success > 0`` (first one wins ties). If no entry has a success,
        returns the first entry unflagged, or a placeholder result named
        ``"none"`` when the list is empty.
    """
    # Reset stale flags first so repeated calls stay consistent.
    for entry in aggregated:
        entry.winner = False

    candidates = [r for r in aggregated if r.success > 0]
    if candidates:
        best = max(candidates, key=lambda r: r.avg_tok_per_sec)
        best.winner = True
        return best

    if aggregated:
        # Nothing succeeded: hand back the first entry, deliberately unflagged.
        return aggregated[0]

    return ConfigResult(
        config_name="none", backend="", model="", kv_type="",
        total_prompts=0, success=0, failed=0,
        avg_ttft_s=None, avg_tok_per_sec=0.0, avg_latency_s=0.0,
        peak_memory_mb=0.0,
    )


def run_comparison(configs: list[ConfigEntry], prompts: list[dict],
                   output_file: Optional[str] = None,
                   timeout: int = 120) -> dict:
    """Benchmark every configuration and build the comparison report.

    For each configuration a banner is printed, all prompts are run, the
    peak-memory probe is read, and the results are aggregated. The winner is
    then selected, the table rendered and printed, and the report optionally
    written to disk as indented JSON.

    Args:
        configs: Configurations to benchmark, in order.
        prompts: Prompt records passed to every configuration.
        output_file: Optional path for the JSON report; missing parent
            directories are created.
        timeout: Per-prompt timeout in seconds.

    Returns:
        The report dict (timestamp, prompt count, winner name and
        throughput, per-config results, rendered table).
    """
    summaries: list[ConfigResult] = []
    for entry in configs:
        print(f"\n--- {entry.name} ({entry.backend}/{entry.kv_type}) ---")
        prompt_results = run_config(entry, prompts, timeout)
        # The memory probe is read right after this configuration's prompts.
        summaries.append(aggregate(prompt_results, entry, get_peak_memory_mb()))

    # Winner must be picked before rendering so its row gets the marker.
    best = pick_winner(summaries)
    rendered = build_comparison_table(summaries)

    report = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "prompts_count": len(prompts),
        "winner": best.config_name,
        "winner_tok_per_sec": best.avg_tok_per_sec,
        "configs": [summary.to_dict() for summary in summaries],
        "table": rendered,
    }

    print(f"\n{rendered}")
    print(f"\nWinner: {best.config_name} ({best.avg_tok_per_sec:.2f} tok/s)")

    if not output_file:
        return report

    target_dir = os.path.dirname(output_file) or "."
    os.makedirs(target_dir, exist_ok=True)
    with open(output_file, "w") as sink:
        json.dump(report, sink, indent=2)
    print(f"Report saved to {output_file}")
    return report


# ---------------------------------------------------------------------------
# Demo mode (no live servers required)
# ---------------------------------------------------------------------------

def run_demo(output_file: Optional[str] = None) -> dict:
    """Generate synthetic benchmark results for testing (no servers needed).

    Each entry of ``DEFAULT_CONFIGS`` gets a fixed baseline perturbed with
    seeded Gaussian noise, so repeated runs produce the same numbers. The
    report has the same shape as the one from ``run_comparison`` plus
    ``"mode": "demo"``.

    Args:
        output_file: Optional path for the JSON report; missing parent
            directories are created.

    Returns:
        The synthetic report dict.

    Raises:
        KeyError: If a default configuration has no entry in the baselines.
    """
    import random

    # Private generator seeded like before: yields the same sequence as
    # ``random.seed(42)`` did, but no longer reseeds the process-wide RNG.
    rng = random.Random(42)

    # Every synthetic configuration pretends to have run this many prompts.
    prompt_count = 10

    # Simulated performance baselines
    baselines = {
        "ollama-gemma4":       {"ttft": 0.85, "tps": 18.2, "mem": 2200},
        "llama-f16":           {"ttft": 0.72, "tps": 22.1, "mem": 2400},
        "llama-turbo4":        {"ttft": 0.68, "tps": 19.8, "mem": 850},
        "llama-turbo4-adaptive": {"ttft": 0.65, "tps": 20.5, "mem": 820},
    }

    all_results: list[ConfigResult] = []
    for cfg in DEFAULT_CONFIGS:
        bl = baselines[cfg.name]
        # Draw order (ttft, tps, latency, memory) determines the seeded
        # output; keep it stable.
        ttft = bl["ttft"] + rng.gauss(0, 0.02)
        tps = bl["tps"] + rng.gauss(0, 0.5)
        # Latency modelled as TTFT plus 512 / throughput (presumably a
        # 512-token generation — confirm), plus noise.
        lat = (ttft + 512 / tps) + rng.gauss(0, 0.1)
        mem = bl["mem"] + rng.gauss(0, 50)

        all_results.append(ConfigResult(
            config_name=cfg.name,
            backend=cfg.backend,
            model=cfg.model,
            kv_type=cfg.kv_type,
            total_prompts=prompt_count,
            success=prompt_count,
            failed=0,
            avg_ttft_s=round(ttft, 3),
            avg_tok_per_sec=round(tps, 2),
            avg_latency_s=round(lat, 3),
            peak_memory_mb=mem,
        ))

    # Winner must be picked before rendering so its row gets the marker.
    winner = pick_winner(all_results)
    table = build_comparison_table(all_results)

    report = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "prompts_count": prompt_count,
        "mode": "demo",
        "winner": winner.config_name,
        "winner_tok_per_sec": winner.avg_tok_per_sec,
        "configs": [r.to_dict() for r in all_results],
        "table": table,
    }

    print(f"\n{table}")
    print(f"\nWinner: {winner.config_name} ({winner.avg_tok_per_sec:.2f} tok/s)")

    if output_file:
        os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
        with open(output_file, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Report saved to {output_file}")

    return report


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    """Parse command-line options and run the demo or a live comparison.

    With ``--demo`` the synthetic report is produced and nothing else is
    read. Otherwise configurations come from ``--config`` (a JSON array of
    objects whose keys are ConfigEntry fields) or from ``DEFAULT_CONFIGS``,
    prompts are loaded from ``--prompts``, and the comparison is run.
    """
    parser = argparse.ArgumentParser(
        description="TurboQuant multi-config benchmark comparison")
    parser.add_argument("--config", type=str,
                        help="JSON file with custom configurations")
    parser.add_argument("--prompts", type=str,
                        default="benchmarks/test_prompts.json",
                        help="Path to test prompts JSON")
    parser.add_argument("--output", type=str, default=None,
                        help="Output file for JSON report")
    parser.add_argument("--timeout", type=int, default=120,
                        help="Timeout per prompt in seconds")
    parser.add_argument("--demo", action="store_true",
                        help="Run with synthetic data (no servers)")

    args = parser.parse_args()

    if args.demo:
        run_demo(args.output)
        return

    # Load configs
    if args.config:
        # JSON text is UTF-8; do not depend on the platform's locale codec.
        with open(args.config, encoding="utf-8") as f:
            raw = json.load(f)
        configs = [ConfigEntry(**c) for c in raw]
    else:
        configs = DEFAULT_CONFIGS

    # Load prompts
    prompts = load_prompts(args.prompts)
    run_comparison(configs, prompts, args.output, args.timeout)

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()