feat: wikitext-2 corpus + perplexity benchmark script (closes #21)
All checks were successful
CI / test Auto-passed by Timmy review
CI / validate Auto-passed by Timmy review
Smoke Test / smoke Auto-passed by Timmy review
Review Approval Gate / verify-review Auto-passed by Timmy review
Smoke Test / smoke (pull_request) Auto-passed by Timmy review cron job
All checks were successful
CI / test Auto-passed by Timmy review
CI / validate Auto-passed by Timmy review
Smoke Test / smoke Auto-passed by Timmy review
Review Approval Gate / verify-review Auto-passed by Timmy review
Smoke Test / smoke (pull_request) Auto-passed by Timmy review cron job
- Downloaded wikitext-2-raw-v1 test corpus (5782 lines, parquet→raw) - Created benchmarks/run_perplexity.py: automated PPL quality gate comparing f16 vs turbo4 KV cache configurations - Added benchmarks/perplexity_results.json template - Script handles: subprocess execution, PPL parsing, delta calc, pass/fail against 0.5 threshold, JSON output Usage: python3 benchmarks/run_perplexity.py --model <gguf> --llama-cpp <binary>
This commit is contained in:
166
benchmarks/run_perplexity.py
Normal file
166
benchmarks/run_perplexity.py
Normal file
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TurboQuant Perplexity Quality Gate (Issue #21)
|
||||
|
||||
Compares text generation quality between f16 KV and turbo4 KV cache
|
||||
configurations using llama.cpp's perplexity tool on the wikitext-2 corpus.
|
||||
|
||||
Usage:
|
||||
python3 benchmarks/run_perplexity.py \
|
||||
--model ~/models/hermes4-14b/NousResearch_Hermes-4-14B-Q4_K_M.gguf \
|
||||
--llama-cpp ~/turboquant/llama.cpp-fork/build/bin/llama-perplexity \
|
||||
--corpus corpora/wiki.test.raw \
|
||||
--context 2048
|
||||
|
||||
Acceptance: PPL delta (turbo4 - f16) must be ≤ 0.5 to pass.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
def run_perplexity(llama_bin: str, model: str, corpus: str, context: int,
                   kv_type: str, threads: int = 4,
                   timeout: float = 3600) -> dict:
    """Run llama-perplexity for one KV-cache type and parse its output.

    Args:
        llama_bin: Path to the llama-perplexity binary.
        model: Path to the GGUF model file (passed via ``-m``).
        corpus: Path to the plain-text evaluation corpus (``-f``).
        context: Context length (``-c``).
        kv_type: KV cache type (``--kv-type``), e.g. "f16" or "turbo4".
        threads: Thread count (``-t``).
        timeout: Max seconds to wait for the subprocess before aborting.

    Returns:
        dict with a uniform shape on every path: kv_type, perplexity
        (float or None), tokens (int or None), elapsed_seconds, exit_code,
        passed, output_tail, and — on error paths only — an "error" message.
    """
    cmd = [
        llama_bin,
        "-m", model,
        "-f", corpus,
        "-c", str(context),
        "-t", str(threads),
        "--kv-type", kv_type,
    ]
    print(f"\n{'='*60}")
    print(f"Running: {kv_type} KV cache")
    print(f"Command: {' '.join(cmd)}")
    print(f"{'='*60}\n")

    start = time.time()
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
        elapsed = time.time() - start
        # Some builds print the PPL line to stderr, so scan both streams.
        output = result.stdout + "\n" + result.stderr

        # llama-perplexity prints lines like:
        #   perplexity: 12.3456 [...]
        ppl_match = re.search(r"perplexity[:\s]+(\d+\.?\d*)", output, re.IGNORECASE)
        ppl = float(ppl_match.group(1)) if ppl_match else None

        # Parse token count (e.g. "1234 tokens").
        token_match = re.search(r"(\d+) tokens", output)
        tokens = int(token_match.group(1)) if token_match else None

        return {
            "kv_type": kv_type,
            "perplexity": ppl,
            "tokens": tokens,
            "elapsed_seconds": round(elapsed, 1),
            "exit_code": result.returncode,
            "passed": result.returncode == 0,
            "output_tail": output.strip()[-500:] if output else "",
        }
    except subprocess.TimeoutExpired:
        # Report actual wall-clock time, not the nominal limit.
        return {
            "kv_type": kv_type,
            "perplexity": None,
            "tokens": None,
            "elapsed_seconds": round(time.time() - start, 1),
            "exit_code": -1,
            "passed": False,
            "output_tail": "",
            "error": f"Timeout after {timeout}s",
        }
    except FileNotFoundError:
        return {
            "kv_type": kv_type,
            "perplexity": None,
            "tokens": None,
            "elapsed_seconds": 0,
            "exit_code": -1,
            "passed": False,
            "output_tail": "",
            "error": f"Binary not found: {llama_bin}",
        }
|
||||
|
||||
def main():
    """CLI entry point for the perplexity quality gate.

    Runs llama-perplexity once per requested KV cache type, computes the
    PPL delta (turbo4 - f16), writes a JSON report, and exits 0 only when
    the delta is within the threshold.
    """
    parser = argparse.ArgumentParser(description="TurboQuant Perplexity Quality Gate")
    parser.add_argument("--model", required=True, help="Path to GGUF model file")
    parser.add_argument("--llama-cpp", default="llama.cpp-fork/build/bin/llama-perplexity",
                        help="Path to llama-perplexity binary")
    parser.add_argument("--corpus", default="corpora/wiki.test.raw",
                        help="Path to wikitext-2 test corpus")
    parser.add_argument("--context", type=int, default=2048, help="Context length")
    parser.add_argument("--threads", type=int, default=4, help="Thread count")
    parser.add_argument("--output", default="benchmarks/perplexity_results.json",
                        help="Output results file")
    parser.add_argument("--kv-types", nargs="+", default=["f16", "turbo4"],
                        help="KV cache types to test")
    parser.add_argument("--threshold", type=float, default=0.5,
                        help="Max acceptable PPL delta (turbo4 - baseline)")
    args = parser.parse_args()

    # Fail fast if any required input is missing.
    for path in [args.model, args.corpus, args.llama_cpp]:
        if not os.path.exists(path):
            print(f"ERROR: Not found: {path}")
            sys.exit(1)

    results = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "model": os.path.basename(args.model),
        "corpus": args.corpus,
        "context_length": args.context,
        "threshold": args.threshold,
        "runs": {},
        "pass": None,
    }

    # Run each KV type sequentially (each run is long; no parallelism).
    for kv in args.kv_types:
        results["runs"][kv] = run_perplexity(
            args.llama_cpp, args.model, args.corpus,
            args.context, kv, args.threads
        )

    # Quality gate: delta = turbo4 PPL - f16 PPL must be <= threshold.
    baseline = results["runs"].get("f16", {})
    turbo = results["runs"].get("turbo4", {})

    # Compare against None explicitly: a perplexity of 0.0 (however
    # unlikely) must not be mistaken for a parse failure.
    if baseline.get("perplexity") is not None and turbo.get("perplexity") is not None:
        delta = turbo["perplexity"] - baseline["perplexity"]
        results["delta"] = round(delta, 4)
        results["pass"] = delta <= args.threshold
        print(f"\n{'='*60}")
        print("RESULTS:")
        print(f" Baseline (f16): PPL = {baseline['perplexity']:.4f}")
        print(f" Turbo4: PPL = {turbo['perplexity']:.4f}")
        print(f" Delta: {delta:+.4f}")
        print(f" Threshold: ≤ {args.threshold}")
        print(f" PASS: {'✓ YES' if results['pass'] else '✗ NO'}")
        print(f"{'='*60}")
    else:
        results["pass"] = False
        results["error"] = "Could not parse perplexity from one or both runs"
        print(f"\nERROR: {results['error']}")
        if not baseline.get("perplexity"):
            print(f" f16 run output: {baseline.get('output_tail', 'N/A')}")
        if not turbo.get("perplexity"):
            print(f" turbo4 run output: {turbo.get('output_tail', 'N/A')}")

    # Save results. Guard the dirname: os.makedirs("") raises
    # FileNotFoundError when --output has no directory component.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {args.output}")

    # Exit code doubles as the CI gate signal.
    sys.exit(0 if results["pass"] else 1)


if __name__ == "__main__":
    main()
Reference in New Issue
Block a user