diff --git a/benchmarks/run_benchmarks.py b/benchmarks/run_benchmarks.py
index 11367f5c..f48c089b 100644
--- a/benchmarks/run_benchmarks.py
+++ b/benchmarks/run_benchmarks.py
@@ -1,17 +1,26 @@
 #!/usr/bin/env python3
 """
-TurboQuant Benchmarking Suite — Multi-Backend (Issue #29)
+TurboQuant Benchmarking Suite — Multi-Backend (Issues #29, #63)
 
 Supports Ollama and llama-server backends with KV cache type configuration.
 Measures: TTFT, tokens/sec, latency, peak memory.
 
+Perplexity (quality) is NOT measured here; tokens/sec is a throughput proxy.
+For actual quality (logprob-based PPL), use the --quality flag, which delegates
+to the llama-perplexity binary, since Ollama lacks logprob support (issue #63).
+
 Usage:
-    # Ollama (default)
+    # Ollama (efficiency only)
     python3 benchmarks/run_benchmarks.py --backend ollama --model llama3
 
-    # llama-server with turbo4 KV
+    # llama-server with turbo4 KV + quality gate in one shot
     python3 benchmarks/run_benchmarks.py --backend llama-server \
-        --url http://localhost:11434 --model qwen3.5 --kv-type turbo4
+        --url http://localhost:11434 --model qwen3.5 --kv-type turbo4 --quality
+
+    # Quality gate only (separate tool)
+    python3 benchmarks/run_perplexity.py --model ~/models/qwen3.5-27b.gguf \
+        --llama-cpp ~/turboquant/llama.cpp-fork/build/bin/llama-perplexity \
+        --corpus corpora/wiki.test.raw --context 2048
 """
 
 import argparse
@@ -108,9 +117,7 @@ def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "f16",
     completion_tokens = usage.get("completion_tokens", 0)
     prompt_tokens = usage.get("prompt_tokens", 0)
 
-    # llama-server includes timing in x_* headers or we estimate
     if elapsed > 0 and completion_tokens > 0:
-        # Subtract estimated prompt eval time (rough)
         tokens_per_sec = completion_tokens / max(elapsed - 0.1, 0.01)
 
     return {
@@ -128,8 +135,10 @@
 def run_benchmark_suite(backend: str, model: str, url: str, kv_type: str,
-                        prompts_file: str, output_file: str, timeout: int = 120):
-    """Run the full benchmark suite."""
+                        prompts_file: str, output_file: str, timeout: int = 120,
+                        measure_quality: bool = False, quality_corpus: str = None,
+                        llama_cpp_bin: str = None, context: int = 2048, threads: int = 4):
+    """Run the full benchmark suite, optionally measuring perplexity in parallel."""
     if not os.path.exists(prompts_file):
         print(f"ERROR: {prompts_file} not found")
         sys.exit(1)
@@ -191,15 +200,76 @@ def run_benchmark_suite(backend: str, model: str, url: str, kv_type: str,
         }
     }
 
+    # Issue #63: Optional quality measurement via llama-perplexity (Ollama lacks logprob)
+    if measure_quality:
+        print("\n" + "="*60)
+        print("Quality measurement requested — invoking llama-perplexity binary...")
+        llama_cpp_bin = llama_cpp_bin or "llama.cpp-fork/build/bin/llama-perplexity"
+        quality_corpus = quality_corpus or "corpora/wiki.test.raw"
+
+        if not os.path.exists(quality_corpus):
+            print(f"WARNING: quality corpus not found: {quality_corpus}")
+            suite["quality"] = {"perplexity": None, "passed": False, "error": f"Corpus missing: {quality_corpus}"}
+        elif not os.path.exists(llama_cpp_bin):
+            print(f"WARNING: llama-perplexity binary not found: {llama_cpp_bin}")
+            suite["quality"] = {"perplexity": None, "passed": False, "error": f"Binary missing: {llama_cpp_bin}"}
+        else:
+            cmd = [
+                llama_cpp_bin,
+                "-m", model,
+                "-f", quality_corpus,
+                "-c", str(context),
+                "-t", str(threads),
+                "--kv-type", kv_type,
+            ]
+            try:
+                start = time.time()
+                result = subprocess.run(cmd, capture_output=True, text=True, 
timeout=3600) + elapsed = time.time() - start + output = result.stdout + "\n" + result.stderr + + ppl_match = re.search(r"perplexity[:\s]+(\d+\.?\d*)", output, re.IGNORECASE) + ppl = float(ppl_match.group(1)) if ppl_match else None + token_match = re.search(r"(\d+) tokens", output) + tokens = int(token_match.group(1)) if token_match else None + + ppl_result = { + "kv_type": kv_type, + "perplexity": ppl, + "tokens": tokens, + "elapsed_seconds": round(elapsed, 1), + "exit_code": result.returncode, + "passed": result.returncode == 0, + "output_tail": output.strip()[-500:] if output else "", + } + suite["quality"] = ppl_result + if ppl is not None: + print(f" Perplexity ({kv_type}): {ppl:.4f}") + else: + print(f" Perplexity: FAILED — could not parse output") + except subprocess.TimeoutExpired: + suite["quality"] = {"perplexity": None, "passed": False, "error": "Timeout after 3600s"} + print(" Perplexity: FAILED — timeout after 3600s") + except Exception as e: + suite["quality"] = {"perplexity": None, "passed": False, "error": str(e)} + print(f" Perplexity: FAILED — {e}") + print("="*60) + os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True) - with open(output_file, "w") as f: - json.dump(suite, f, indent=2) + with open(output_file, "w") as fh: + json.dump(suite, fh, indent=2) s = suite["summary"] print(f"\n{'='*60}") print(f"RESULTS: {s['success']}/{s['total']} success | " f"Avg {s['avg_tok_per_sec']:.1f} tok/s | " f"Avg {s['avg_latency_s']:.2f}s latency") + if "quality" in suite: + q = suite["quality"] + if q.get("perplexity") is not None: + print(f"Quality: PPL = {q['perplexity']:.4f}") + else: + print(f"Quality: not available — {q.get('error','unknown')}") print(f"{'='*60}") print(f"Saved to {output_file}") @@ -207,20 +277,45 @@ def run_benchmark_suite(backend: str, model: str, url: str, kv_type: str, def main(): parser = argparse.ArgumentParser(description="TurboQuant Benchmark Suite") parser.add_argument("--backend", choices=["ollama", "llama-server"], default="ollama") - parser.add_argument("--model", required=True, help="Model name") + parser.add_argument("--model", required=True, help="Model name or path") parser.add_argument("--url", default="http://localhost:11434", help="Backend URL") parser.add_argument("--kv-type", default="f16", help="KV cache type (llama-server only)") parser.add_argument("--prompts", default="benchmarks/prompts.json", help="Prompts file") parser.add_argument("--output", default=None, help="Output file (auto-generated if omitted)") parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout (s)") + + # Issue #63: Quality measurement (Ollama lacks logprob → use llama-perplexity binary) + parser.add_argument("--quality", action="store_true", default=False, + help="Also run quality measurement via llama-perplexity binary") + parser.add_argument("--llama-cpp", default="llama.cpp-fork/build/bin/llama-perplexity", + help="Path to llama-perplexity binary") + parser.add_argument("--quality-corpus", default="corpora/wiki.test.raw", + help="Test corpus for perplexity measurement") + parser.add_argument("--context", type=int, default=2048, + help="Context length for quality measurement") + parser.add_argument("--threads", type=int, default=4, + help="Thread count for quality measurement") + args = parser.parse_args() if args.output is None: ts = int(time.time()) args.output = f"benchmarks/results_{args.backend}_{args.kv_type}_{ts}.json" - run_benchmark_suite(args.backend, args.model, args.url, args.kv_type, - args.prompts, args.output, 
args.timeout)
+    run_benchmark_suite(
+        backend=args.backend,
+        model=args.model,
+        url=args.url,
+        kv_type=args.kv_type,
+        prompts_file=args.prompts,
+        output_file=args.output,
+        timeout=args.timeout,
+        measure_quality=args.quality,
+        quality_corpus=args.quality_corpus,
+        llama_cpp_bin=args.llama_cpp,
+        context=args.context,
+        threads=args.threads,
+    )
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/run_perplexity.py b/benchmarks/run_perplexity.py
index a88ef74a..aaf46c49 100644
--- a/benchmarks/run_perplexity.py
+++ b/benchmarks/run_perplexity.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
 """
-TurboQuant Perplexity Quality Gate (Issue #21)
+TurboQuant Perplexity Quality Gate (Issues #21, #63)
 
-Compares text generation quality between f16 KV and turbo4 KV cache
+Measures true perplexity with the llama-perplexity binary (logprob-based), since
+Ollama lacks a logprob API (issue #63). Compares f16 KV and turbo4 KV cache
 configurations using llama.cpp's perplexity tool on the wikitext-2 corpus.
 
 Usage: