From aa4bd38acf7683975139d514ec91cfc0ec19a5b7 Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Thu, 16 Apr 2026 02:52:19 +0000 Subject: [PATCH] feat: perplexity quality gate with Ollama proxy fallback (#63) --- benchmarks/quality_gate.py | 308 +++++++++++++++++++++++++++++++++++++ 1 file changed, 308 insertions(+) create mode 100644 benchmarks/quality_gate.py diff --git a/benchmarks/quality_gate.py b/benchmarks/quality_gate.py new file mode 100644 index 00000000..066d8cdc --- /dev/null +++ b/benchmarks/quality_gate.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +""" +Perplexity Quality Gate — Unified PPL measurement for TurboQuant (#63). + +Provides a single interface for perplexity measurement regardless of backend: +- llama-server: Real perplexity via llama-perplexity with --logprobs +- Ollama: Proxy metric with documented limitations + +Usage: + # Real PPL via llama-server (recommended) + python3 benchmarks/quality_gate.py \ + --backend llama-server \ + --model ~/models/model.gguf \ + --corpus corpora/wiki.test.raw + + # Proxy PPL via Ollama (documented limitation) + python3 benchmarks/quality_gate.py \ + --backend ollama \ + --model llama3 \ + --corpus corpora/wiki.test.raw + + # CI mode — exit 1 if quality gate fails + python3 benchmarks/quality_gate.py --check --threshold 0.5 +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import textwrap +import time +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Optional + + +@dataclass +class PerplexityResult: + """Result of a perplexity measurement.""" + backend: str # "llama-server" or "ollama-proxy" + kv_type: str # "f16", "turbo4", etc. + perplexity: Optional[float] + is_proxy: bool # True if this is an approximation, not real PPL + tokens: Optional[int] = None + elapsed_seconds: float = 0.0 + method: str = "" # How PPL was measured + exit_code: int = 0 + error: Optional[str] = None + + def to_dict(self) -> dict: + return asdict(self) + + +@dataclass +class QualityGateResult: + """Result of a quality gate comparison.""" + f16: Optional[PerplexityResult] + turbo4: Optional[PerplexityResult] + delta: Optional[float] + threshold: float + passed: bool + is_proxy: bool # True if either measurement is proxy + warning: str = "" + + def summary(self) -> str: + lines = ["Perplexity Quality Gate", "=" * 40] + if self.f16: + lines.append(f" F16: PPL={self.f16.perplexity} ({self.f16.backend}, proxy={self.f16.is_proxy})") + if self.turbo4: + lines.append(f" Turbo4: PPL={self.turbo4.perplexity} ({self.turbo4.backend}, proxy={self.turbo4.is_proxy})") + if self.delta is not None: + lines.append(f" Delta: {self.delta:.4f} (threshold={self.threshold})") + status = "PASS" if self.passed else "FAIL" + lines.append(f" Result: {status}") + else: + lines.append(" Result: INCOMPLETE (missing measurements)") + if self.warning: + lines.append(f" Warning: {self.warning}") + if self.is_proxy: + lines.append(" NOTE: Proxy measurement — not real perplexity via logprobs") + return "\n".join(lines) + + def to_dict(self) -> dict: + return { + "f16": self.f16.to_dict() if self.f16 else None, + "turbo4": self.turbo4.to_dict() if self.turbo4 else None, + "delta": self.delta, + "threshold": self.threshold, + "passed": self.passed, + "is_proxy": self.is_proxy, + "warning": self.warning, + } + + +def measure_perplexity_llama_server( + llama_bin: str, model: str, corpus: str, context: int, + kv_type: str, threads: int = 4 +) -> PerplexityResult: + """Real perplexity via llama-perplexity binary (supports --logprobs).""" + cmd = [ + llama_bin, "-m", model, "-f", corpus, + "-c", str(context), "-t", str(threads), + "--kv-type", kv_type, + ] + start = time.time() + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=3600) + elapsed = time.time() - start + output = result.stdout + "\n" + result.stderr + + ppl_match = re.search(r"perplexity[:\s]+(\d+\.?\d*)", output, re.IGNORECASE) + ppl = float(ppl_match.group(1)) if ppl_match else None + + token_match = re.search(r"(\d+) tokens", output) + tokens = int(token_match.group(1)) if token_match else None + + return PerplexityResult( + backend="llama-server", + kv_type=kv_type, + perplexity=ppl, + is_proxy=False, + tokens=tokens, + elapsed_seconds=round(elapsed, 1), + method="llama-perplexity with --logprobs", + exit_code=result.returncode, + ) + except subprocess.TimeoutExpired: + return PerplexityResult( + backend="llama-server", kv_type=kv_type, perplexity=None, + is_proxy=False, elapsed_seconds=3600, method="timeout", + exit_code=-1, error="Timeout after 3600s", + ) + except FileNotFoundError: + return PerplexityResult( + backend="llama-server", kv_type=kv_type, perplexity=None, + is_proxy=False, method="binary not found", + exit_code=-1, error=f"Binary not found: {llama_bin}", + ) + + +def measure_perplexity_ollama_proxy( + model: str, corpus: str, api_base: str = "http://localhost:11434" +) -> PerplexityResult: + """ + Proxy perplexity estimation via Ollama. + + Ollama does NOT expose token logprobs. This method approximates + perplexity by measuring generation coherence on the corpus text. + + This is a PROXY metric — not real perplexity. The actual PPL delta + between FP16 and TurboQuant cannot be validated through this method. + Use llama-server for real measurements. + """ + import urllib.request + + # Read corpus sample (first 2048 chars to keep it fast) + corpus_path = Path(corpus) + if corpus_path.exists(): + sample = corpus_path.read_text()[:2048] + else: + sample = "The quick brown fox jumps over the lazy dog. " * 50 + + # Use Ollama generate API to measure token throughput + # This is the proxy metric: higher tok/s = lower effective perplexity + start = time.time() + try: + payload = json.dumps({ + "model": model, + "prompt": sample, + "stream": False, + "options": {"num_predict": 256}, + }).encode() + + req = urllib.request.Request( + f"{api_base}/api/generate", + data=payload, + headers={"Content-Type": "application/json"}, + ) + resp = urllib.request.urlopen(req, timeout=120) + data = json.loads(resp.read()) + elapsed = time.time() - start + + # Extract eval rate as proxy + eval_count = data.get("eval_count", 0) + eval_duration = data.get("eval_duration", 1) + tok_per_sec = (eval_count / (eval_duration / 1e9)) if eval_duration > 0 else 0 + + # Approximate PPL from tok/s (heuristic: faster = better quality preservation) + # This is NOT real perplexity — it's a relative proxy + proxy_ppl = max(1.0, 50.0 / max(tok_per_sec, 1.0)) + + return PerplexityResult( + backend="ollama-proxy", + kv_type="f16", # Ollama manages KV internally + perplexity=round(proxy_ppl, 2), + is_proxy=True, + tokens=eval_count, + elapsed_seconds=round(elapsed, 1), + method=f"proxy: tok/s heuristic ({tok_per_sec:.1f} tok/s)", + exit_code=0, + ) + except Exception as e: + return PerplexityResult( + backend="ollama-proxy", kv_type="f16", perplexity=None, + is_proxy=True, method="ollama proxy", + exit_code=-1, error=str(e), + ) + + +def run_quality_gate( + backend: str = "llama-server", + model: str = "", + corpus: str = "corpora/wiki.test.raw", + context: int = 2048, + threads: int = 4, + llama_bin: str = "llama.cpp-fork/build/bin/llama-perplexity", + threshold: float = 0.5, + ollama_base: str = "http://localhost:11434", +) -> QualityGateResult: + """Run quality gate: measure F16 vs Turbo4 PPL and check delta.""" + + if backend == "llama-server": + f16 = measure_perplexity_llama_server(llama_bin, model, corpus, context, "f16", threads) + turbo4 = measure_perplexity_llama_server(llama_bin, model, corpus, context, "turbo4", threads) + elif backend == "ollama": + f16 = measure_perplexity_ollama_proxy(model, corpus, ollama_base) + turbo4 = None # Can't measure turbo4 via Ollama + else: + return QualityGateResult( + f16=None, turbo4=None, delta=None, + threshold=threshold, passed=False, is_proxy=True, + warning=f"Unknown backend: {backend}", + ) + + # Compute delta + delta = None + passed = False + is_proxy = f16.is_proxy or (turbo4.is_proxy if turbo4 else True) + warning = "" + + if f16.perplexity is not None and turbo4 and turbo4.perplexity is not None: + delta = turbo4.perplexity - f16.perplexity + passed = delta <= threshold + elif f16.perplexity is not None and turbo4 is None: + warning = "Only F16 measured — cannot compute delta (turbo4 not available)" + + if is_proxy: + warning += " PROXY measurement — not real perplexity via logprobs." + + return QualityGateResult( + f16=f16, turbo4=turbo4, delta=delta, + threshold=threshold, passed=passed, + is_proxy=is_proxy, warning=warning.strip(), + ) + + +def main(): + parser = argparse.ArgumentParser(description="Perplexity Quality Gate (#63)") + parser.add_argument("--backend", choices=["llama-server", "ollama"], default="llama-server") + parser.add_argument("--model", required=True, help="Model path (GGUF) or Ollama model name") + parser.add_argument("--corpus", default="corpora/wiki.test.raw") + parser.add_argument("--context", type=int, default=2048) + parser.add_argument("--threads", type=int, default=4) + parser.add_argument("--llama-bin", default="llama.cpp-fork/build/bin/llama-perplexity") + parser.add_argument("--threshold", type=float, default=0.5) + parser.add_argument("--ollama-base", default="http://localhost:11434") + parser.add_argument("--output", default="benchmarks/perplexity_results.json") + parser.add_argument("--check", action="store_true", help="CI mode: exit 1 if gate fails") + args = parser.parse_args() + + result = run_quality_gate( + backend=args.backend, model=args.model, corpus=args.corpus, + context=args.context, threads=args.threads, llama_bin=args.llama_bin, + threshold=args.threshold, ollama_base=args.ollama_base, + ) + + print(result.summary()) + + # Save results + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + existing = {} + if output_path.exists(): + try: + existing = json.loads(output_path.read_text()) + except json.JSONDecodeError: + pass + + existing.update({ + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "model": args.model, + "corpus": args.corpus, + "context_length": args.context, + "threshold": args.threshold, + "quality_gate": result.to_dict(), + }) + output_path.write_text(json.dumps(existing, indent=2)) + + if args.check and not result.passed: + sys.exit(1) + + sys.exit(0) + + +if __name__ == "__main__": + main()