Files
turboquant/benchmarks/run_perplexity.py
Alexander Whitestone e4f15254b3
All checks were successful
CI / test Auto-passed by Timmy review
CI / validate Auto-passed by Timmy review
Smoke Test / smoke Auto-passed by Timmy review
Review Approval Gate / verify-review Auto-passed by Timmy review
Smoke Test / smoke (pull_request) Auto-passed by Timmy review cron job
feat: wikitext-2 corpus + perplexity benchmark script (closes #21)
- Downloaded wikitext-2-raw-v1 test corpus (5782 lines, parquet→raw)
- Created benchmarks/run_perplexity.py: automated PPL quality gate
  comparing f16 vs turbo4 KV cache configurations
- Added benchmarks/perplexity_results.json template
- Script handles: subprocess execution, PPL parsing, delta calc,
  pass/fail against 0.5 threshold, JSON output

Usage: python3 benchmarks/run_perplexity.py --model <gguf> --llama-cpp <binary>
2026-04-12 00:39:14 -04:00

167 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
TurboQuant Perplexity Quality Gate (Issue #21)
Compares text generation quality between f16 KV and turbo4 KV cache
configurations using llama.cpp's perplexity tool on the wikitext-2 corpus.
Usage:
python3 benchmarks/run_perplexity.py \
--model ~/models/hermes4-14b/NousResearch_Hermes-4-14B-Q4_K_M.gguf \
--llama-cpp ~/turboquant/llama.cpp-fork/build/bin/llama-perplexity \
--corpus corpora/wiki.test.raw \
--context 2048
Acceptance: PPL delta (turbo4 - f16) must be ≤ 0.5 to pass.
"""
import argparse
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
def run_perplexity(llama_bin: str, model: str, corpus: str, context: int,
                   kv_type: str, threads: int = 4,
                   timeout: int = 3600) -> dict:
    """Run llama-perplexity for one KV-cache type and parse its output.

    Args:
        llama_bin: Path to the llama-perplexity binary.
        model: Path to the GGUF model file (passed via ``-m``).
        corpus: Path to the plain-text evaluation corpus (passed via ``-f``).
        context: Context length (passed via ``-c``).
        kv_type: KV cache type passed via ``--kv-type`` (e.g. "f16", "turbo4").
        threads: Thread count (passed via ``-t``).
        timeout: Max seconds to wait for the subprocess. Default preserves
            the previous hard-coded 3600s behaviour.

    Returns:
        A dict with a consistent schema on every path (success, timeout,
        missing binary): kv_type, perplexity (float | None),
        tokens (int | None), elapsed_seconds, exit_code, passed,
        output_tail — plus an "error" key on the failure paths.
    """
    cmd = [
        llama_bin,
        "-m", model,
        "-f", corpus,
        "-c", str(context),
        "-t", str(threads),
        "--kv-type", kv_type,
    ]
    print(f"\n{'='*60}")
    print(f"Running: {kv_type} KV cache")
    print(f"Command: {' '.join(cmd)}")
    print(f"{'='*60}\n")

    start = time.time()
    # Single result schema for all exit paths, so the JSON report and the
    # error printing in main() can rely on every key being present.
    info = {
        "kv_type": kv_type,
        "perplexity": None,
        "tokens": None,
        "elapsed_seconds": 0.0,
        "exit_code": -1,
        "passed": False,
        "output_tail": "",
    }
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        # Report the measured wall time rather than a hard-coded constant.
        info["elapsed_seconds"] = round(time.time() - start, 1)
        info["error"] = f"Timeout after {timeout}s"
        return info
    except FileNotFoundError:
        info["elapsed_seconds"] = round(time.time() - start, 1)
        info["error"] = f"Binary not found: {llama_bin}"
        return info

    info["elapsed_seconds"] = round(time.time() - start, 1)
    output = result.stdout + "\n" + result.stderr
    # llama-perplexity prints lines like: "perplexity: 12.3456 [...]"
    ppl_match = re.search(r"perplexity[:\s]+(\d+\.?\d*)", output, re.IGNORECASE)
    if ppl_match:
        info["perplexity"] = float(ppl_match.group(1))
    # Token count, when reported, appears as e.g. "286000 tokens".
    token_match = re.search(r"(\d+) tokens", output)
    if token_match:
        info["tokens"] = int(token_match.group(1))
    info["exit_code"] = result.returncode
    info["passed"] = result.returncode == 0
    # Keep only the tail of the (potentially huge) combined output for the
    # JSON report and error diagnostics.
    info["output_tail"] = output.strip()[-500:] if output else ""
    return info
def main():
    """CLI entry point: run the perplexity gate for each KV type.

    Validates input paths, runs llama-perplexity once per requested KV
    cache type, computes the turbo4 - f16 PPL delta against the threshold,
    writes a JSON report, and exits 0 on pass / 1 on fail.
    """
    parser = argparse.ArgumentParser(description="TurboQuant Perplexity Quality Gate")
    parser.add_argument("--model", required=True, help="Path to GGUF model file")
    parser.add_argument("--llama-cpp", default="llama.cpp-fork/build/bin/llama-perplexity",
                        help="Path to llama-perplexity binary")
    parser.add_argument("--corpus", default="corpora/wiki.test.raw",
                        help="Path to wikitext-2 test corpus")
    parser.add_argument("--context", type=int, default=2048, help="Context length")
    parser.add_argument("--threads", type=int, default=4, help="Thread count")
    parser.add_argument("--output", default="benchmarks/perplexity_results.json",
                        help="Output results file")
    parser.add_argument("--kv-types", nargs="+", default=["f16", "turbo4"],
                        help="KV cache types to test")
    parser.add_argument("--threshold", type=float, default=0.5,
                        help="Max acceptable PPL delta (turbo4 - baseline)")
    args = parser.parse_args()

    # Fail fast on missing inputs before spending up to an hour per run.
    for path in [args.model, args.corpus, args.llama_cpp]:
        if not os.path.exists(path):
            print(f"ERROR: Not found: {path}")
            sys.exit(1)

    results = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "model": os.path.basename(args.model),
        "corpus": args.corpus,
        "context_length": args.context,
        "threshold": args.threshold,
        "runs": {},
        "pass": None,
    }

    # Run each requested KV cache type sequentially.
    for kv in args.kv_types:
        results["runs"][kv] = run_perplexity(
            args.llama_cpp, args.model, args.corpus,
            args.context, kv, args.threads
        )

    # The gate is defined on the f16 vs turbo4 pair, even if additional
    # --kv-types were requested.
    baseline = results["runs"].get("f16", {})
    turbo = results["runs"].get("turbo4", {})
    # Explicit None checks: a parsed perplexity of 0.0 (however unlikely)
    # must not be mistaken for "missing".
    if baseline.get("perplexity") is not None and turbo.get("perplexity") is not None:
        delta = turbo["perplexity"] - baseline["perplexity"]
        results["delta"] = round(delta, 4)
        results["pass"] = delta <= args.threshold
        print(f"\n{'='*60}")
        print(f"RESULTS:")
        print(f" Baseline (f16): PPL = {baseline['perplexity']:.4f}")
        print(f" Turbo4: PPL = {turbo['perplexity']:.4f}")
        print(f" Delta: {delta:+.4f}")
        print(f" Threshold: ≤ {args.threshold}")
        print(f" PASS: {'✓ YES' if results['pass'] else '✗ NO'}")
        print(f"{'='*60}")
    else:
        results["pass"] = False
        results["error"] = "Could not parse perplexity from one or both runs"
        print(f"\nERROR: {results['error']}")
        if not baseline.get("perplexity"):
            print(f" f16 run output: {baseline.get('output_tail', 'N/A')}")
        if not turbo.get("perplexity"):
            print(f" turbo4 run output: {turbo.get('output_tail', 'N/A')}")

    # Save results. BUG FIX: os.path.dirname() returns "" for a bare
    # filename like "results.json", and os.makedirs("") raises
    # FileNotFoundError — only create the directory when one is given.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {args.output}")

    sys.exit(0 if results["pass"] else 1)
# Script entry point: run the quality gate only when executed directly,
# not when imported (e.g. by tests or other tooling).
if __name__ == "__main__":
    main()