All checks were successful
CI / test Auto-passed by Timmy review
CI / validate Auto-passed by Timmy review
Smoke Test / smoke Auto-passed by Timmy review
Review Approval Gate / verify-review Auto-passed by Timmy review
Smoke Test / smoke (pull_request) Auto-passed by Timmy review cron job
- Downloaded wikitext-2-raw-v1 test corpus (5782 lines, parquet→raw) - Created benchmarks/run_perplexity.py: automated PPL quality gate comparing f16 vs turbo4 KV cache configurations - Added benchmarks/perplexity_results.json template - Script handles: subprocess execution, PPL parsing, delta calc, pass/fail against 0.5 threshold, JSON output Usage: python3 benchmarks/run_perplexity.py --model <gguf> --llama-cpp <binary>
167 lines
5.8 KiB
Python
167 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
TurboQuant Perplexity Quality Gate (Issue #21)

Compares text generation quality between f16 KV and turbo4 KV cache
configurations using llama.cpp's perplexity tool on the wikitext-2 corpus.

Usage:
    python3 benchmarks/run_perplexity.py \
        --model ~/models/hermes4-14b/NousResearch_Hermes-4-14B-Q4_K_M.gguf \
        --llama-cpp ~/turboquant/llama.cpp-fork/build/bin/llama-perplexity \
        --corpus corpora/wiki.test.raw \
        --context 2048

Acceptance: PPL delta (turbo4 - f16) must be ≤ 0.5 to pass.
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
# Summary line printed once at the end of a llama-perplexity run, e.g.
#   Final estimate: PPL = 5.9066 +/- 0.03553
_FINAL_PPL_RE = re.compile(
    r"Final estimate:\s*PPL\s*=\s*(\d+\.?\d*)", re.IGNORECASE
)
# Legacy / fallback format:  "perplexity: 12.3456 [...]"
_LABELLED_PPL_RE = re.compile(r"perplexity[:\s]+(\d+\.?\d*)", re.IGNORECASE)
_TOKEN_COUNT_RE = re.compile(r"(\d+) tokens")


def _parse_perplexity(output: str):
    """Return the perplexity reported in *output*, or None if absent."""
    final = _FINAL_PPL_RE.search(output)
    if final:
        return float(final.group(1))

    value = None
    for match in _LABELLED_PPL_RE.finditer(output):
        # The tool also logs "perplexity: 0.58 seconds per pass - ETA ...";
        # that number is a timing, not a perplexity, so skip it.
        trailing = output[match.end():match.end() + 16].lstrip()
        if trailing.startswith("seconds"):
            continue
        # Keep the last candidate: later lines supersede earlier ones.
        value = float(match.group(1))
    return value


def _parse_token_count(output: str):
    """Return the first "<N> tokens" count found in *output*, or None."""
    match = _TOKEN_COUNT_RE.search(output)
    return int(match.group(1)) if match else None


def run_perplexity(llama_bin: str, model: str, corpus: str, context: int,
                   kv_type: str, threads: int = 4, timeout: int = 3600) -> dict:
    """Run llama-perplexity once and parse its output.

    Args:
        llama_bin: Path to the llama-perplexity executable.
        model: Path to the GGUF model file (passed via ``-m``).
        corpus: Path to the evaluation text file (passed via ``-f``).
        context: Context length (passed via ``-c``).
        kv_type: KV cache type (passed via ``--kv-type``).
        threads: Thread count (passed via ``-t``).
        timeout: Maximum wall-clock seconds to wait for the process.

    Returns:
        A dict describing the run. It always contains ``kv_type``,
        ``perplexity`` (float or None), ``elapsed_seconds``, ``exit_code``
        and ``passed``. Completed runs add ``tokens`` and ``output_tail``;
        runs that timed out or could not start add ``error`` instead.
    """
    cmd = [
        llama_bin,
        "-m", model,
        "-f", corpus,
        "-c", str(context),
        "-t", str(threads),
        "--kv-type", kv_type,
    ]
    print(f"\n{'='*60}")
    print(f"Running: {kv_type} KV cache")
    print(f"Command: {' '.join(cmd)}")
    print(f"{'='*60}\n")

    start = time.time()
    # Only the process launch can raise the exceptions handled here, so the
    # parsing below is deliberately kept outside the try block.
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return {
            "kv_type": kv_type,
            "perplexity": None,
            "elapsed_seconds": timeout,
            "exit_code": -1,
            "passed": False,
            "error": f"Timeout after {timeout}s",
        }
    except FileNotFoundError:
        return {
            "kv_type": kv_type,
            "perplexity": None,
            "elapsed_seconds": 0,
            "exit_code": -1,
            "passed": False,
            "error": f"Binary not found: {llama_bin}",
        }
    elapsed = time.time() - start

    # Both streams are searched, so they are combined before parsing.
    output = result.stdout + "\n" + result.stderr

    return {
        "kv_type": kv_type,
        "perplexity": _parse_perplexity(output),
        "tokens": _parse_token_count(output),
        "elapsed_seconds": round(elapsed, 1),
        "exit_code": result.returncode,
        "passed": result.returncode == 0,
        "output_tail": output.strip()[-500:] if output else "",
    }
|
|
|
|
|
|
def main(argv=None):
    """Run the perplexity quality gate and exit with its verdict.

    Runs llama-perplexity once per requested KV cache type, compares the
    ``turbo4`` perplexity against the ``f16`` baseline, writes a JSON report
    and terminates the process with exit status 0 (pass) or 1 (fail).

    Args:
        argv: Optional list of command-line arguments. When None, argparse
            reads them from ``sys.argv``.

    Raises:
        SystemExit: Always; the code is 0 when the gate passes, otherwise 1.
    """
    parser = argparse.ArgumentParser(description="TurboQuant Perplexity Quality Gate")
    parser.add_argument("--model", required=True, help="Path to GGUF model file")
    parser.add_argument("--llama-cpp", default="llama.cpp-fork/build/bin/llama-perplexity",
                        help="Path to llama-perplexity binary")
    parser.add_argument("--corpus", default="corpora/wiki.test.raw",
                        help="Path to wikitext-2 test corpus")
    parser.add_argument("--context", type=int, default=2048, help="Context length")
    parser.add_argument("--threads", type=int, default=4, help="Thread count")
    parser.add_argument("--output", default="benchmarks/perplexity_results.json",
                        help="Output results file")
    parser.add_argument("--kv-types", nargs="+", default=["f16", "turbo4"],
                        help="KV cache types to test")
    parser.add_argument("--threshold", type=float, default=0.5,
                        help="Max acceptable PPL delta (turbo4 - baseline)")
    args = parser.parse_args(argv)

    # Validate inputs before starting any (potentially hour-long) run.
    for path in (args.model, args.corpus, args.llama_cpp):
        if not os.path.exists(path):
            print(f"ERROR: Not found: {path}")
            sys.exit(1)

    results = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "model": os.path.basename(args.model),
        "corpus": args.corpus,
        "context_length": args.context,
        "threshold": args.threshold,
        "runs": {},
        "pass": None,
    }

    # Run each KV type
    for kv in args.kv_types:
        results["runs"][kv] = run_perplexity(
            args.llama_cpp, args.model, args.corpus,
            args.context, kv, args.threads
        )

    # Calculate delta and pass/fail. The gate is defined on the "f16" and
    # "turbo4" runs specifically; if either is missing the gate fails.
    baseline = results["runs"].get("f16", {})
    turbo = results["runs"].get("turbo4", {})
    baseline_ppl = baseline.get("perplexity")
    turbo_ppl = turbo.get("perplexity")

    # Compare against None explicitly so a parsed value is never mistaken
    # for "missing" just because it is falsy.
    if baseline_ppl is not None and turbo_ppl is not None:
        delta = turbo_ppl - baseline_ppl
        results["delta"] = round(delta, 4)
        results["pass"] = delta <= args.threshold
        print(f"\n{'='*60}")
        print("RESULTS:")
        print(f" Baseline (f16): PPL = {baseline_ppl:.4f}")
        print(f" Turbo4: PPL = {turbo_ppl:.4f}")
        print(f" Delta: {delta:+.4f}")
        print(f" Threshold: ≤ {args.threshold}")
        print(f" PASS: {'✓ YES' if results['pass'] else '✗ NO'}")
        print(f"{'='*60}")
    else:
        results["pass"] = False
        results["error"] = "Could not parse perplexity from one or both runs"
        print(f"\nERROR: {results['error']}")
        if baseline_ppl is None:
            print(f" f16 run output: {baseline.get('output_tail', 'N/A')}")
        if turbo_ppl is None:
            print(f" turbo4 run output: {turbo.get('output_tail', 'N/A')}")

    # Save results. os.path.dirname() returns "" for a bare filename and
    # os.makedirs("") raises, so only create the directory when there is one.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {args.output}")

    sys.exit(0 if results["pass"] else 1)
|
|
|
|
|
|
# Script entry point: run the quality gate only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|