#!/usr/bin/env python3
"""
TurboQuant Perplexity Quality Gate (Issue #21)

Compares text generation quality between f16 KV and turbo4 KV cache
configurations using llama.cpp's perplexity tool on the wikitext-2 corpus.

Usage:
    python3 benchmarks/run_perplexity.py \
        --model ~/models/hermes4-14b/NousResearch_Hermes-4-14B-Q4_K_M.gguf \
        --llama-cpp ~/turboquant/llama.cpp-fork/build/bin/llama-perplexity \
        --corpus corpora/wiki.test.raw \
        --context 2048

Acceptance: PPL delta (turbo4 - f16) must be ≤ 0.5 to pass.
"""

import argparse
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from typing import Optional

# Final summary line of the perplexity tool, e.g.
#   "Final estimate: PPL = 5.9066 +/- 0.03432"
_FINAL_PPL_RE = re.compile(
    r"Final estimate:\s*PPL\s*=\s*(\d+(?:\.\d+)?)", re.IGNORECASE
)
# Legacy / fallback form, e.g. "perplexity: 12.3456 [...]".
_LEGACY_PPL_RE = re.compile(r"perplexity[:\s]+(\d+\.?\d*)", re.IGNORECASE)
_TOKENS_RE = re.compile(r"(\d+) tokens")


def _parse_perplexity(output: str) -> Optional[float]:
    """Extract the perplexity value from the tool output, or None if absent."""
    final = _FINAL_PPL_RE.findall(output)
    if final:
        # The last "Final estimate" line is the authoritative result.
        return float(final[-1])

    # Fall back to the generic "perplexity: <number>" form, scanning from the
    # end so the most recent value wins. Progress lines such as
    # "perplexity: 0.52 seconds per pass - ETA ..." are timings, not results.
    for line in reversed(output.splitlines()):
        if "seconds per pass" in line:
            continue
        match = _LEGACY_PPL_RE.search(line)
        if match:
            return float(match.group(1))
    return None


def _failed_run(kv_type: str, elapsed: float, error: str) -> dict:
    """Build a result dict for a run that never produced usable output.

    Carries the same keys as a successful run so consumers of the JSON
    report do not have to special-case failures.
    """
    return {
        "kv_type": kv_type,
        "perplexity": None,
        "tokens": None,
        "elapsed_seconds": elapsed,
        "exit_code": -1,
        "passed": False,
        "output_tail": "",
        "error": error,
    }


def run_perplexity(llama_bin: str, model: str, corpus: str, context: int,
                   kv_type: str, threads: int = 4,
                   timeout: int = 3600) -> dict:
    """Run llama-perplexity and parse the output.

    Args:
        llama_bin: Path to the llama-perplexity executable.
        model: Path to the GGUF model file.
        corpus: Path to the evaluation text corpus.
        context: Context length passed via ``-c``.
        kv_type: KV cache type passed via ``--kv-type``.
        threads: Thread count passed via ``-t``.
        timeout: Maximum run time in seconds before the run is abandoned.

    Returns:
        A dict with keys ``kv_type``, ``perplexity`` (float or None),
        ``tokens`` (int or None), ``elapsed_seconds``, ``exit_code``,
        ``passed`` and ``output_tail``; failed launches additionally carry
        an ``error`` message.
    """
    cmd = [
        llama_bin,
        "-m", model,
        "-f", corpus,
        "-c", str(context),
        "-t", str(threads),
        "--kv-type", kv_type,
    ]

    print(f"\n{'='*60}")
    print(f"Running: {kv_type} KV cache")
    print(f"Command: {' '.join(cmd)}")
    print(f"{'='*60}\n")

    start = time.time()
    try:
        # errors="replace" keeps a stray undecodable byte in the tool output
        # from aborting the whole benchmark.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return _failed_run(kv_type, timeout, f"Timeout after {timeout}s")
    except FileNotFoundError:
        return _failed_run(kv_type, 0, f"Binary not found: {llama_bin}")
    except OSError as exc:
        # e.g. PermissionError when the binary is not executable.
        return _failed_run(kv_type, 0, f"Could not execute {llama_bin}: {exc}")

    elapsed = time.time() - start
    # The tool may report on either stream, so search both.
    output = result.stdout + "\n" + result.stderr

    token_match = _TOKENS_RE.search(output)
    return {
        "kv_type": kv_type,
        "perplexity": _parse_perplexity(output),
        "tokens": int(token_match.group(1)) if token_match else None,
        "elapsed_seconds": round(elapsed, 1),
        "exit_code": result.returncode,
        "passed": result.returncode == 0,
        "output_tail": output.strip()[-500:] if output else "",
    }


def main():
    """Parse CLI arguments, run each KV configuration, and write the report.

    Exits with status 0 when the turbo4 - f16 perplexity delta is within
    ``--threshold``, and 1 otherwise (including missing inputs or
    unparseable output).
    """
    parser = argparse.ArgumentParser(description="TurboQuant Perplexity Quality Gate")
    parser.add_argument("--model", required=True, help="Path to GGUF model file")
    parser.add_argument("--llama-cpp", default="llama.cpp-fork/build/bin/llama-perplexity",
                        help="Path to llama-perplexity binary")
    parser.add_argument("--corpus", default="corpora/wiki.test.raw",
                        help="Path to wikitext-2 test corpus")
    parser.add_argument("--context", type=int, default=2048, help="Context length")
    parser.add_argument("--threads", type=int, default=4, help="Thread count")
    parser.add_argument("--output", default="benchmarks/perplexity_results.json",
                        help="Output results file")
    parser.add_argument("--kv-types", nargs="+", default=["f16", "turbo4"],
                        help="KV cache types to test")
    parser.add_argument("--threshold", type=float, default=0.5,
                        help="Max acceptable PPL delta (turbo4 - baseline)")
    args = parser.parse_args()

    # Validate inputs
    for path in [args.model, args.corpus, args.llama_cpp]:
        if not os.path.exists(path):
            print(f"ERROR: Not found: {path}")
            sys.exit(1)

    results = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "model": os.path.basename(args.model),
        "corpus": args.corpus,
        "context_length": args.context,
        "threshold": args.threshold,
        "runs": {},
        "pass": None,
    }

    # Run each KV type
    for kv in args.kv_types:
        results["runs"][kv] = run_perplexity(
            args.llama_cpp, args.model, args.corpus,
            args.context, kv, args.threads
        )

    # Calculate delta and pass/fail
    baseline = results["runs"].get("f16", {})
    turbo = results["runs"].get("turbo4", {})
    baseline_ppl = baseline.get("perplexity")
    turbo_ppl = turbo.get("perplexity")

    # Compare against None explicitly so a parsed value of 0.0 is not
    # mistaken for a missing result.
    if baseline_ppl is not None and turbo_ppl is not None:
        delta = turbo_ppl - baseline_ppl
        results["delta"] = round(delta, 4)
        results["pass"] = delta <= args.threshold

        print(f"\n{'='*60}")
        print(f"RESULTS:")
        print(f"  Baseline (f16):  PPL = {baseline_ppl:.4f}")
        print(f"  Turbo4:          PPL = {turbo_ppl:.4f}")
        print(f"  Delta:           {delta:+.4f}")
        print(f"  Threshold:       ≤ {args.threshold}")
        print(f"  PASS:            {'✓ YES' if results['pass'] else '✗ NO'}")
        print(f"{'='*60}")
    else:
        results["pass"] = False
        results["error"] = "Could not parse perplexity from one or both runs"
        print(f"\nERROR: {results['error']}")
        if baseline_ppl is None:
            print(f"  f16 run output: {baseline.get('output_tail', 'N/A')}")
        if turbo_ppl is None:
            print(f"  turbo4 run output: {turbo.get('output_tail', 'N/A')}")

    # Save results. dirname is "" for a bare filename, and os.makedirs("")
    # raises, so only create the directory when there is one.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {args.output}")

    sys.exit(0 if results["pass"] else 1)


if __name__ == "__main__":
    main()