feat: wikitext-2 corpus + perplexity benchmark script (closes #21)
All checks were successful
CI / test Auto-passed by Timmy review
CI / validate Auto-passed by Timmy review
Smoke Test / smoke Auto-passed by Timmy review
Review Approval Gate / verify-review Auto-passed by Timmy review
Smoke Test / smoke (pull_request) Auto-passed by Timmy review cron job
All checks were successful
CI / test Auto-passed by Timmy review
CI / validate Auto-passed by Timmy review
Smoke Test / smoke Auto-passed by Timmy review
Review Approval Gate / verify-review Auto-passed by Timmy review
Smoke Test / smoke (pull_request) Auto-passed by Timmy review cron job
- Downloaded wikitext-2-raw-v1 test corpus (5782 lines, parquet→raw) - Created benchmarks/run_perplexity.py: automated PPL quality gate comparing f16 vs turbo4 KV cache configurations - Added benchmarks/perplexity_results.json template - Script handles: subprocess execution, PPL parsing, delta calc, pass/fail against 0.5 threshold, JSON output Usage: python3 benchmarks/run_perplexity.py --model <gguf> --llama-cpp <binary>
This commit is contained in:
166
benchmarks/run_perplexity.py
Normal file
166
benchmarks/run_perplexity.py
Normal file
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TurboQuant Perplexity Quality Gate (Issue #21)
|
||||
|
||||
Compares text generation quality between f16 KV and turbo4 KV cache
|
||||
configurations using llama.cpp's perplexity tool on the wikitext-2 corpus.
|
||||
|
||||
Usage:
|
||||
python3 benchmarks/run_perplexity.py \
|
||||
--model ~/models/hermes4-14b/NousResearch_Hermes-4-14B-Q4_K_M.gguf \
|
||||
--llama-cpp ~/turboquant/llama.cpp-fork/build/bin/llama-perplexity \
|
||||
--corpus corpora/wiki.test.raw \
|
||||
--context 2048
|
||||
|
||||
Acceptance: PPL delta (turbo4 - f16) must be ≤ 0.5 to pass.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
def run_perplexity(llama_bin: str, model: str, corpus: str, context: int,
                   kv_type: str, threads: int = 4,
                   timeout: float = 3600) -> dict:
    """Run llama-perplexity for one KV-cache type and parse its output.

    Args:
        llama_bin: Path to the llama-perplexity binary.
        model: Path to the GGUF model file (passed via ``-m``).
        corpus: Path to the plain-text evaluation corpus (``-f``).
        context: Context length (``-c``).
        kv_type: KV cache type (``--kv-type``), e.g. "f16" or "turbo4".
        threads: Thread count (``-t``).
        timeout: Max seconds to wait for the subprocess before aborting.

    Returns:
        dict with a uniform shape on every path: kv_type, perplexity
        (float or None), tokens (int or None), elapsed_seconds, exit_code,
        passed, output_tail, and — on error paths only — an "error" message.
    """
    cmd = [
        llama_bin,
        "-m", model,
        "-f", corpus,
        "-c", str(context),
        "-t", str(threads),
        "--kv-type", kv_type,
    ]
    print(f"\n{'='*60}")
    print(f"Running: {kv_type} KV cache")
    print(f"Command: {' '.join(cmd)}")
    print(f"{'='*60}\n")

    start = time.time()
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
        elapsed = time.time() - start
        # Some builds print the PPL line to stderr, so scan both streams.
        output = result.stdout + "\n" + result.stderr

        # llama-perplexity prints lines like:
        #   perplexity: 12.3456 [...]
        ppl_match = re.search(r"perplexity[:\s]+(\d+\.?\d*)", output, re.IGNORECASE)
        ppl = float(ppl_match.group(1)) if ppl_match else None

        # Parse token count (e.g. "1234 tokens").
        token_match = re.search(r"(\d+) tokens", output)
        tokens = int(token_match.group(1)) if token_match else None

        return {
            "kv_type": kv_type,
            "perplexity": ppl,
            "tokens": tokens,
            "elapsed_seconds": round(elapsed, 1),
            "exit_code": result.returncode,
            "passed": result.returncode == 0,
            "output_tail": output.strip()[-500:] if output else "",
        }
    except subprocess.TimeoutExpired:
        # Report actual wall-clock time, not the nominal limit.
        return {
            "kv_type": kv_type,
            "perplexity": None,
            "tokens": None,
            "elapsed_seconds": round(time.time() - start, 1),
            "exit_code": -1,
            "passed": False,
            "output_tail": "",
            "error": f"Timeout after {timeout}s",
        }
    except FileNotFoundError:
        return {
            "kv_type": kv_type,
            "perplexity": None,
            "tokens": None,
            "elapsed_seconds": 0,
            "exit_code": -1,
            "passed": False,
            "output_tail": "",
            "error": f"Binary not found: {llama_bin}",
        }
|
||||
|
||||
def main():
    """CLI entry point for the perplexity quality gate.

    Runs llama-perplexity once per requested KV cache type, computes the
    PPL delta (turbo4 - f16), writes a JSON report, and exits 0 only when
    the delta is within the threshold.
    """
    parser = argparse.ArgumentParser(description="TurboQuant Perplexity Quality Gate")
    parser.add_argument("--model", required=True, help="Path to GGUF model file")
    parser.add_argument("--llama-cpp", default="llama.cpp-fork/build/bin/llama-perplexity",
                        help="Path to llama-perplexity binary")
    parser.add_argument("--corpus", default="corpora/wiki.test.raw",
                        help="Path to wikitext-2 test corpus")
    parser.add_argument("--context", type=int, default=2048, help="Context length")
    parser.add_argument("--threads", type=int, default=4, help="Thread count")
    parser.add_argument("--output", default="benchmarks/perplexity_results.json",
                        help="Output results file")
    parser.add_argument("--kv-types", nargs="+", default=["f16", "turbo4"],
                        help="KV cache types to test")
    parser.add_argument("--threshold", type=float, default=0.5,
                        help="Max acceptable PPL delta (turbo4 - baseline)")
    args = parser.parse_args()

    # Fail fast if any required input is missing.
    for path in [args.model, args.corpus, args.llama_cpp]:
        if not os.path.exists(path):
            print(f"ERROR: Not found: {path}")
            sys.exit(1)

    results = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "model": os.path.basename(args.model),
        "corpus": args.corpus,
        "context_length": args.context,
        "threshold": args.threshold,
        "runs": {},
        "pass": None,
    }

    # Run each KV type sequentially (each run is long; no parallelism).
    for kv in args.kv_types:
        results["runs"][kv] = run_perplexity(
            args.llama_cpp, args.model, args.corpus,
            args.context, kv, args.threads
        )

    # Quality gate: delta = turbo4 PPL - f16 PPL must be <= threshold.
    baseline = results["runs"].get("f16", {})
    turbo = results["runs"].get("turbo4", {})

    # Compare against None explicitly: a perplexity of 0.0 (however
    # unlikely) must not be mistaken for a parse failure.
    if baseline.get("perplexity") is not None and turbo.get("perplexity") is not None:
        delta = turbo["perplexity"] - baseline["perplexity"]
        results["delta"] = round(delta, 4)
        results["pass"] = delta <= args.threshold
        print(f"\n{'='*60}")
        print("RESULTS:")
        print(f" Baseline (f16): PPL = {baseline['perplexity']:.4f}")
        print(f" Turbo4: PPL = {turbo['perplexity']:.4f}")
        print(f" Delta: {delta:+.4f}")
        print(f" Threshold: ≤ {args.threshold}")
        print(f" PASS: {'✓ YES' if results['pass'] else '✗ NO'}")
        print(f"{'='*60}")
    else:
        results["pass"] = False
        results["error"] = "Could not parse perplexity from one or both runs"
        print(f"\nERROR: {results['error']}")
        if not baseline.get("perplexity"):
            print(f" f16 run output: {baseline.get('output_tail', 'N/A')}")
        if not turbo.get("perplexity"):
            print(f" turbo4 run output: {turbo.get('output_tail', 'N/A')}")

    # Save results. Guard the dirname: os.makedirs("") raises
    # FileNotFoundError when --output has no directory component.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {args.output}")

    # Exit code doubles as the CI gate signal.
    sys.exit(0 if results["pass"] else 1)


if __name__ == "__main__":
    main()
Reference in New Issue
Block a user