Files
turboquant/benchmarks/run_perplexity.py
Alexander Whitestone e4f15254b3
All checks were successful
CI / test Auto-passed by Timmy review
CI / validate Auto-passed by Timmy review
Smoke Test / smoke Auto-passed by Timmy review
Review Approval Gate / verify-review Auto-passed by Timmy review
Smoke Test / smoke (pull_request) Auto-passed by Timmy review cron job
feat: wikitext-2 corpus + perplexity benchmark script (closes #21)
- Downloaded wikitext-2-raw-v1 test corpus (5782 lines, parquet→raw)
- Created benchmarks/run_perplexity.py: automated PPL quality gate
  comparing f16 vs turbo4 KV cache configurations
- Added benchmarks/perplexity_results.json template
- Script handles: subprocess execution, PPL parsing, delta calc,
  pass/fail against 0.5 threshold, JSON output

Usage: python3 benchmarks/run_perplexity.py --model <gguf> --llama-cpp <binary>
2026-04-12 00:39:14 -04:00

167 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
TurboQuant Perplexity Quality Gate (Issue #21)
Compares text generation quality between f16 KV and turbo4 KV cache
configurations using llama.cpp's perplexity tool on the wikitext-2 corpus.
Usage:
python3 benchmarks/run_perplexity.py \
--model ~/models/hermes4-14b/NousResearch_Hermes-4-14B-Q4_K_M.gguf \
--llama-cpp ~/turboquant/llama.cpp-fork/build/bin/llama-perplexity \
--corpus corpora/wiki.test.raw \
--context 2048
Acceptance: PPL delta (turbo4 - f16) must be ≤ 0.5 to pass.
"""
import argparse
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
def run_perplexity(llama_bin: str, model: str, corpus: str, context: int,
                   kv_type: str, threads: int = 4,
                   timeout: int = 3600) -> dict:
    """Run llama-perplexity for one KV-cache type and parse its output.

    Args:
        llama_bin: Path to the llama-perplexity binary.
        model: Path to the GGUF model file (passed via ``-m``).
        corpus: Path to the plain-text evaluation corpus (passed via ``-f``).
        context: Context length (passed via ``-c``).
        kv_type: KV cache type passed via ``--kv-type`` (e.g. "f16", "turbo4").
        threads: Thread count (passed via ``-t``).
        timeout: Max seconds to wait for the subprocess. Default preserves
            the previous hard-coded 3600s behaviour.

    Returns:
        A dict with a consistent schema on every path (success, timeout,
        missing binary): kv_type, perplexity (float | None),
        tokens (int | None), elapsed_seconds, exit_code, passed,
        output_tail — plus an "error" key on the failure paths.
    """
    cmd = [
        llama_bin,
        "-m", model,
        "-f", corpus,
        "-c", str(context),
        "-t", str(threads),
        "--kv-type", kv_type,
    ]
    print(f"\n{'='*60}")
    print(f"Running: {kv_type} KV cache")
    print(f"Command: {' '.join(cmd)}")
    print(f"{'='*60}\n")

    start = time.time()
    # Single result schema for all exit paths, so the JSON report and the
    # error printing in main() can rely on every key being present.
    info = {
        "kv_type": kv_type,
        "perplexity": None,
        "tokens": None,
        "elapsed_seconds": 0.0,
        "exit_code": -1,
        "passed": False,
        "output_tail": "",
    }
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        # Report the measured wall time rather than a hard-coded constant.
        info["elapsed_seconds"] = round(time.time() - start, 1)
        info["error"] = f"Timeout after {timeout}s"
        return info
    except FileNotFoundError:
        info["elapsed_seconds"] = round(time.time() - start, 1)
        info["error"] = f"Binary not found: {llama_bin}"
        return info

    info["elapsed_seconds"] = round(time.time() - start, 1)
    output = result.stdout + "\n" + result.stderr
    # llama-perplexity prints lines like: "perplexity: 12.3456 [...]"
    ppl_match = re.search(r"perplexity[:\s]+(\d+\.?\d*)", output, re.IGNORECASE)
    if ppl_match:
        info["perplexity"] = float(ppl_match.group(1))
    # Token count, when reported, appears as e.g. "286000 tokens".
    token_match = re.search(r"(\d+) tokens", output)
    if token_match:
        info["tokens"] = int(token_match.group(1))
    info["exit_code"] = result.returncode
    info["passed"] = result.returncode == 0
    # Keep only the tail of the (potentially huge) combined output for the
    # JSON report and error diagnostics.
    info["output_tail"] = output.strip()[-500:] if output else ""
    return info
def main():
    """CLI entry point: run the perplexity gate for each KV type.

    Validates input paths, runs llama-perplexity once per requested KV
    cache type, computes the turbo4 - f16 PPL delta against the threshold,
    writes a JSON report, and exits 0 on pass / 1 on fail.
    """
    parser = argparse.ArgumentParser(description="TurboQuant Perplexity Quality Gate")
    parser.add_argument("--model", required=True, help="Path to GGUF model file")
    parser.add_argument("--llama-cpp", default="llama.cpp-fork/build/bin/llama-perplexity",
                        help="Path to llama-perplexity binary")
    parser.add_argument("--corpus", default="corpora/wiki.test.raw",
                        help="Path to wikitext-2 test corpus")
    parser.add_argument("--context", type=int, default=2048, help="Context length")
    parser.add_argument("--threads", type=int, default=4, help="Thread count")
    parser.add_argument("--output", default="benchmarks/perplexity_results.json",
                        help="Output results file")
    parser.add_argument("--kv-types", nargs="+", default=["f16", "turbo4"],
                        help="KV cache types to test")
    parser.add_argument("--threshold", type=float, default=0.5,
                        help="Max acceptable PPL delta (turbo4 - baseline)")
    args = parser.parse_args()

    # Fail fast on missing inputs before spending up to an hour per run.
    for path in [args.model, args.corpus, args.llama_cpp]:
        if not os.path.exists(path):
            print(f"ERROR: Not found: {path}")
            sys.exit(1)

    results = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "model": os.path.basename(args.model),
        "corpus": args.corpus,
        "context_length": args.context,
        "threshold": args.threshold,
        "runs": {},
        "pass": None,
    }

    # Run each requested KV cache type sequentially.
    for kv in args.kv_types:
        results["runs"][kv] = run_perplexity(
            args.llama_cpp, args.model, args.corpus,
            args.context, kv, args.threads
        )

    # The gate is defined on the f16 vs turbo4 pair, even if additional
    # --kv-types were requested.
    baseline = results["runs"].get("f16", {})
    turbo = results["runs"].get("turbo4", {})
    # Explicit None checks: a parsed perplexity of 0.0 (however unlikely)
    # must not be mistaken for "missing".
    if baseline.get("perplexity") is not None and turbo.get("perplexity") is not None:
        delta = turbo["perplexity"] - baseline["perplexity"]
        results["delta"] = round(delta, 4)
        results["pass"] = delta <= args.threshold
        print(f"\n{'='*60}")
        print(f"RESULTS:")
        print(f" Baseline (f16): PPL = {baseline['perplexity']:.4f}")
        print(f" Turbo4: PPL = {turbo['perplexity']:.4f}")
        print(f" Delta: {delta:+.4f}")
        print(f" Threshold: ≤ {args.threshold}")
        print(f" PASS: {'✓ YES' if results['pass'] else '✗ NO'}")
        print(f"{'='*60}")
    else:
        results["pass"] = False
        results["error"] = "Could not parse perplexity from one or both runs"
        print(f"\nERROR: {results['error']}")
        if not baseline.get("perplexity"):
            print(f" f16 run output: {baseline.get('output_tail', 'N/A')}")
        if not turbo.get("perplexity"):
            print(f" turbo4 run output: {turbo.get('output_tail', 'N/A')}")

    # Save results. BUG FIX: os.path.dirname() returns "" for a bare
    # filename like "results.json", and os.makedirs("") raises
    # FileNotFoundError — only create the directory when one is given.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {args.output}")

    sys.exit(0 if results["pass"] else 1)
# Script entry point: run the quality gate only when executed directly,
# not when imported (e.g. by tests or other tooling).
if __name__ == "__main__":
    main()