feat: wikitext-2 corpus + perplexity benchmark script (closes #21)
All checks were successful
CI / test Auto-passed by Timmy review
CI / validate Auto-passed by Timmy review
Smoke Test / smoke Auto-passed by Timmy review
Review Approval Gate / verify-review Auto-passed by Timmy review
Smoke Test / smoke (pull_request) Auto-passed by Timmy review cron job
- Downloaded wikitext-2-raw-v1 test corpus (5782 lines, parquet→raw)
- Created benchmarks/run_perplexity.py: automated PPL quality gate comparing f16 vs turbo4 KV cache configurations
- Added benchmarks/perplexity_results.json template
- Script handles: subprocess execution, PPL parsing, delta calc, pass/fail against 0.5 threshold, JSON output

Usage: python3 benchmarks/run_perplexity.py --model <gguf> --llama-cpp <binary>
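The gate logic the commit describes (run each KV-cache config, parse the final PPL, compare the delta against the 0.5 threshold) can be sketched roughly as below. This is a minimal illustration, not the actual benchmarks/run_perplexity.py: the `run_ppl`/`evaluate` names, the CLI flags passed to the llama.cpp binary, and the `PPL =` output pattern are all assumptions.

```python
import re
import subprocess
import time

# Assumed shape of llama.cpp's perplexity output, e.g. "Final estimate: PPL = 5.1234"
PPL_RE = re.compile(r"PPL\s*=\s*([0-9.]+)")

def run_ppl(binary, model, kv_type, corpus, ctx=2048):
    """Run one perplexity pass and parse the final PPL value (hypothetical flags)."""
    cmd = [binary, "-m", model, "-f", corpus, "-c", str(ctx),
           "--cache-type-k", kv_type]  # flag names are an assumption
    start = time.time()
    proc = subprocess.run(cmd, capture_output=True, text=True)
    match = PPL_RE.search(proc.stdout + proc.stderr)
    ppl = float(match.group(1)) if match else None
    return {"kv_type": kv_type,
            "perplexity": ppl,
            "elapsed_seconds": round(time.time() - start, 2),
            "exit_code": proc.returncode,
            "passed": ppl is not None}

def evaluate(f16_ppl, turbo4_ppl, threshold=0.5):
    """Quality gate: turbo4 may not exceed f16 perplexity by more than `threshold`."""
    delta = turbo4_ppl - f16_ppl
    return {"delta": round(delta, 4), "pass": delta <= threshold}
```

With these assumed helpers, `evaluate(5.0, 5.3)` passes (delta 0.3 ≤ 0.5) while `evaluate(5.0, 5.6)` fails, and the two `run_ppl` result dicts slot directly into the `"runs"` section of the JSON template below.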
benchmarks/perplexity_results.json (new file, 31 lines)
@@ -0,0 +1,31 @@
{
  "timestamp": null,
  "model": null,
  "corpus": "corpora/wiki.test.raw",
  "context_length": 2048,
  "threshold": 0.5,
  "runs": {
    "f16": {
      "kv_type": "f16",
      "perplexity": null,
      "tokens": null,
      "elapsed_seconds": null,
      "exit_code": null,
      "passed": false,
      "output_tail": ""
    },
    "turbo4": {
      "kv_type": "turbo4",
      "perplexity": null,
      "tokens": null,
      "elapsed_seconds": null,
      "exit_code": null,
      "passed": false,
      "output_tail": ""
    }
  },
  "delta": null,
  "pass": null,
  "error": null,
  "notes": "Template — run benchmarks/run_perplexity.py to populate. Issue #21."
}