Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
90b5eddfa1 docs: Document Ollama perplexity limitation — no logprob support (closes #63)
All checks were successful
Smoke Test / smoke (pull_request) Successful in 26s
Ollama lacks token logprob API, so true perplexity cannot be measured
via the Ollama backend. Added warning to run_benchmarks.py docstring
directing users to run_perplexity.py (llama-perplexity binary) for
real PPL measurement with --logprobs support.
2026-04-14 23:23:38 -04:00
3 changed files with 9 additions and 497 deletions

View File

@@ -1,332 +0,0 @@
#!/usr/bin/env python3
"""
TurboQuant Benchmark Comparison (Issue #29).
Runs multiple inference configurations and produces a side-by-side
comparison table with TTFT, tokens/sec, and peak memory.
Configurations (default):
1. Ollama gemma4 (baseline)
2. llama-server gemma4 f16 KV
3. llama-server gemma4 turbo4 KV
4. llama-server gemma4 turbo4 + layer-adaptive
Usage:
python3 benchmarks/compare_configs.py --help
python3 benchmarks/compare_configs.py --config benchmarks/configs.json
python3 benchmarks/compare_configs.py --demo
"""
import argparse
import json
import os
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Put this script's own directory at the front of sys.path so the sibling
# module run_benchmarks can be imported.
sys.path.insert(0, str(Path(__file__).resolve().parent))

try:
    from run_benchmarks import get_peak_memory_mb, run_llama_server, run_ollama
except ImportError:
    # run_benchmarks (and requests) are unavailable: install stand-in runners
    # with the same call signatures so the rest of the module keeps working.

    def run_ollama(prompt, model, url, timeout=120):  # type: ignore
        """Stand-in Ollama runner: report the prompt as skipped."""
        return dict(
            status="skipped",
            error="run_benchmarks not available",
            latency_s=0,
        )

    def run_llama_server(prompt, model, url, kv_type="f16", timeout=120):  # type: ignore
        """Stand-in llama-server runner: report the prompt as skipped."""
        return dict(
            status="skipped",
            error="run_benchmarks not available",
            latency_s=0,
        )

    def get_peak_memory_mb():  # type: ignore
        """Stand-in memory probe: always reports 0.0 MB."""
        return 0.0
# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------
@dataclass
class ConfigEntry:
    """Description of a single inference setup that can be benchmarked.

    Attributes:
        name: Label used for this configuration in output and reports.
        backend: Backend identifier, "ollama" or "llama-server".
        model: Model name handed to the backend runner.
        url: Base URL of the backend.
        kv_type: KV cache type label; defaults to "f16".
        layer_adaptive: Marks the layer-adaptive variant; off by default.
        env: Extra environment variables for this configuration; every
            instance gets its own empty dict by default.
    """

    name: str
    # One of: "ollama" | "llama-server"
    backend: str
    model: str
    url: str
    kv_type: str = "f16"
    layer_adaptive: bool = False
    env: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return all fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
@dataclass
class ConfigResult:
    """Summary of one configuration's benchmark run.

    Attributes:
        config_name: Name of the configuration these numbers belong to.
        backend: Backend identifier of that configuration.
        model: Model name of that configuration.
        kv_type: KV cache type label of that configuration.
        total_prompts: Number of prompts that were run.
        success: Number of prompts that succeeded.
        failed: Number of prompts that did not succeed.
        avg_ttft_s: Mean time-to-first-token in seconds, or None when no
            TTFT sample is available.
        avg_tok_per_sec: Mean tokens per second.
        avg_latency_s: Mean latency in seconds.
        peak_memory_mb: Peak memory in megabytes.
        winner: True once this result has been picked as the winner.
    """

    # Identity of the configuration
    config_name: str
    backend: str
    model: str
    kv_type: str
    # Prompt counters
    total_prompts: int
    success: int
    failed: int
    # Averaged metrics
    avg_ttft_s: Optional[float]
    avg_tok_per_sec: float
    avg_latency_s: float
    peak_memory_mb: float
    # Selection flag
    winner: bool = False

    def to_dict(self) -> dict:
        """Return all fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
# ---------------------------------------------------------------------------
# Default configurations
# ---------------------------------------------------------------------------
# Built-in configurations, used when no --config file is supplied:
# one Ollama baseline followed by three llama-server variants.
DEFAULT_CONFIGS: list[ConfigEntry] = [
    # 1. Ollama baseline
    ConfigEntry(
        name="ollama-gemma4",
        backend="ollama",
        model="gemma4",
        url="http://localhost:11434",
        kv_type="default",
    ),
    # 2. llama-server with f16 KV cache
    ConfigEntry(
        name="llama-f16",
        backend="llama-server",
        model="gemma4",
        url="http://localhost:8081",
        kv_type="f16",
    ),
    # 3. llama-server with turbo4 KV cache
    ConfigEntry(
        name="llama-turbo4",
        backend="llama-server",
        model="gemma4",
        url="http://localhost:8081",
        kv_type="turbo4",
    ),
    # 4. llama-server with turbo4 KV cache, layer-adaptive
    ConfigEntry(
        name="llama-turbo4-adaptive",
        backend="llama-server",
        model="gemma4",
        url="http://localhost:8081",
        kv_type="turbo4",
        layer_adaptive=True,
    ),
]
# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------
def load_prompts(prompts_file: str) -> list[dict]:
    """Load test prompts from a JSON file.

    Args:
        prompts_file: Path to the JSON file holding the prompt records.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode as UTF-8 explicitly: the platform default encoding is
    # locale-dependent and can mis-decode non-ASCII prompt text.
    with open(prompts_file, encoding="utf-8") as f:
        return json.load(f)
def run_config(config: ConfigEntry, prompts: list[dict], timeout: int = 120) -> list[dict]:
    """Run all prompts against a single configuration, return per-prompt results.

    Args:
        config: Configuration to exercise. A backend of "ollama" is routed to
            ``run_ollama``; any other backend is routed to ``run_llama_server``
            together with ``config.kv_type``.
        prompts: Prompt records. Each must contain a "prompt" string and may
            contain an "id" or "category" used to label its result.
        timeout: Per-prompt timeout in seconds, forwarded to the runner.

    Returns:
        One result dict per prompt, in input order: whatever the runner
        returned, extended with "id" and "prompt_preview" keys.

    Raises:
        KeyError: If a prompt record has no "prompt" key.
    """
    # NOTE(review): config.env and config.layer_adaptive are NOT applied here.
    # The previous code merged them into a local environment mapping (adding
    # TURBO_LAYER_ADAPTIVE=7 for layer-adaptive configs) but never passed that
    # mapping to anything, so it had no effect. The dead mapping was removed;
    # if those settings matter they presumably have to be applied wherever the
    # backend server is launched — confirm.
    use_ollama = config.backend == "ollama"
    results = []
    for item in prompts:
        prompt_text = item["prompt"]
        if use_ollama:
            result = run_ollama(prompt_text, config.model, config.url, timeout)
        else:
            result = run_llama_server(prompt_text, config.model, config.url,
                                      kv_type=config.kv_type, timeout=timeout)
        # Label the result: prefer the prompt's id, then its category.
        result["id"] = item.get("id", item.get("category", "unknown"))
        result["prompt_preview"] = prompt_text[:120]
        results.append(result)
    return results
def aggregate(results: list[dict], config: ConfigEntry, peak_mb: float) -> ConfigResult:
    """Condense per-prompt result dicts into one ConfigResult.

    Only results whose "status" is "success" contribute to the averages.
    TTFT samples that are None and tokens-per-second samples that are
    missing or falsy are left out of their respective means.

    Args:
        results: Per-prompt result dicts.
        config: Configuration the results belong to.
        peak_mb: Peak memory in MB, stored unchanged on the result.

    Returns:
        A ConfigResult with counts and rounded averages; ``avg_ttft_s`` is
        None when no TTFT sample exists, the other means fall back to 0.0.
    """

    def _mean(samples, digits, empty):
        # Rounded arithmetic mean, or the fallback for an empty sample list.
        if not samples:
            return empty
        return round(sum(samples) / len(samples), digits)

    ok_runs = [entry for entry in results if entry.get("status") == "success"]

    ttft_samples = []
    tps_samples = []
    latency_samples = []
    for entry in ok_runs:
        if entry.get("ttft_s") is not None:
            ttft_samples.append(entry["ttft_s"])
        if entry.get("tokens_per_sec"):
            tps_samples.append(entry["tokens_per_sec"])
        latency_samples.append(entry["latency_s"])

    total = len(results)
    ok_count = len(ok_runs)
    return ConfigResult(
        config_name=config.name,
        backend=config.backend,
        model=config.model,
        kv_type=config.kv_type,
        total_prompts=total,
        success=ok_count,
        failed=total - ok_count,
        avg_ttft_s=_mean(ttft_samples, 3, None),
        avg_tok_per_sec=_mean(tps_samples, 2, 0.0),
        avg_latency_s=_mean(latency_samples, 3, 0.0),
        peak_memory_mb=peak_mb,
    )
def build_comparison_table(aggregated: list[ConfigResult]) -> str:
    """Render the aggregated results as a fixed-width text table.

    The table has a header row, a dashed rule of the same width, and one row
    per result. A missing TTFT is shown as "N/A" and the winning row is
    tagged with " <- WINNER".

    Args:
        aggregated: Results to render, in display order.

    Returns:
        The table as a single newline-joined string.
    """
    heading = (
        f"{'Config':<28} {'TTFT':<8} {'tok/s':<10} "
        f"{'lat(s)':<8} {'mem(MB)':<9} {'ok/n':<6}"
    )
    rows = [heading, "-" * len(heading)]

    for entry in aggregated:
        if entry.avg_ttft_s is None:
            ttft_cell = "N/A"
        else:
            ttft_cell = f"{entry.avg_ttft_s:.3f}"
        tag = " <- WINNER" if entry.winner else ""
        cells = [
            f"{entry.config_name:<28}",
            f"{ttft_cell:<8}",
            f"{entry.avg_tok_per_sec:<10.2f}",
            f"{entry.avg_latency_s:<8.3f}",
            f"{entry.peak_memory_mb:<9.1f}",
            f"{entry.success}/{entry.total_prompts}{tag}",
        ]
        rows.append(" ".join(cells))

    return "\n".join(rows)
def pick_winner(aggregated: list[ConfigResult]) -> ConfigResult:
    """Select the result with the highest tokens/sec and flag it as winner.

    Only results with at least one successful prompt compete; on a tie the
    earliest one in the list wins. The chosen result's ``winner`` attribute
    is set to True in place.

    Args:
        aggregated: Results to choose from.

    Returns:
        The winning result. When nothing succeeded, the first result is
        returned unflagged; when the list is empty, a placeholder result
        named "none" is returned.
    """
    best = None
    for entry in aggregated:
        if entry.success > 0:
            # Strict comparison keeps the first of several equal maxima.
            if best is None or entry.avg_tok_per_sec > best.avg_tok_per_sec:
                best = entry

    if best is not None:
        best.winner = True
        return best

    if aggregated:
        return aggregated[0]

    return ConfigResult(
        config_name="none",
        backend="",
        model="",
        kv_type="",
        total_prompts=0,
        success=0,
        failed=0,
        avg_ttft_s=None,
        avg_tok_per_sec=0.0,
        avg_latency_s=0.0,
        peak_memory_mb=0.0,
    )
def run_comparison(configs: list[ConfigEntry], prompts: list[dict],
                   output_file: Optional[str] = None,
                   timeout: int = 120) -> dict:
    """Benchmark every configuration and assemble the comparison report.

    For each configuration the prompts are run, peak memory is sampled and
    the results are aggregated. The winner is then chosen, the table is
    printed, and the report is optionally written to disk as JSON.

    Args:
        configs: Configurations to benchmark, in order.
        prompts: Prompt records passed to every configuration.
        output_file: Optional path for the JSON report; parent directories
            are created when needed.
        timeout: Per-prompt timeout in seconds.

    Returns:
        The report dict (timestamp, prompt count, winner, per-config
        results and the rendered table).
    """
    summaries: list[ConfigResult] = []
    for entry in configs:
        print(f"\n--- {entry.name} ({entry.backend}/{entry.kv_type}) ---")
        raw_results = run_config(entry, prompts, timeout)
        # Memory is sampled after the prompts of this configuration have run.
        summaries.append(aggregate(raw_results, entry, get_peak_memory_mb()))

    # The winner must be flagged before the table is rendered so its row
    # carries the marker.
    best = pick_winner(summaries)
    rendered = build_comparison_table(summaries)

    report = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "prompts_count": len(prompts),
        "winner": best.config_name,
        "winner_tok_per_sec": best.avg_tok_per_sec,
        "configs": [summary.to_dict() for summary in summaries],
        "table": rendered,
    }

    print(f"\n{rendered}")
    print(f"\nWinner: {best.config_name} ({best.avg_tok_per_sec:.2f} tok/s)")

    if not output_file:
        return report

    target_dir = os.path.dirname(output_file) or "."
    os.makedirs(target_dir, exist_ok=True)
    with open(output_file, "w") as sink:
        sink.write(json.dumps(report, indent=2))
    print(f"Report saved to {output_file}")
    return report
# ---------------------------------------------------------------------------
# Demo mode (no live servers required)
# ---------------------------------------------------------------------------
def run_demo(output_file: Optional[str] = None) -> dict:
    """Produce a synthetic comparison report without contacting any server.

    Each default configuration gets simulated metrics: a fixed baseline plus
    Gaussian noise from a generator seeded with 42, so the numbers are the
    same on every run.

    Args:
        output_file: Optional path for the JSON report; parent directories
            are created when needed.

    Returns:
        The report dict, shaped like a real comparison report with an extra
        "mode": "demo" entry.
    """
    import random

    random.seed(42)
    simulated_prompts = 10

    # Simulated baselines per configuration: (ttft seconds, tok/s, memory MB)
    profiles = {
        "ollama-gemma4": (0.85, 18.2, 2200),
        "llama-f16": (0.72, 22.1, 2400),
        "llama-turbo4": (0.68, 19.8, 850),
        "llama-turbo4-adaptive": (0.65, 20.5, 820),
    }

    summaries: list[ConfigResult] = []
    for entry in DEFAULT_CONFIGS:
        base_ttft, base_tps, base_mem = profiles[entry.name]
        # Noise is drawn in a fixed order (ttft, tok/s, latency, memory) so
        # the seeded sequence stays reproducible.
        sim_ttft = base_ttft + random.gauss(0, 0.02)
        sim_tps = base_tps + random.gauss(0, 0.5)
        sim_latency = (sim_ttft + 512 / sim_tps) + random.gauss(0, 0.1)
        sim_memory = base_mem + random.gauss(0, 50)
        summaries.append(
            ConfigResult(
                config_name=entry.name,
                backend=entry.backend,
                model=entry.model,
                kv_type=entry.kv_type,
                total_prompts=simulated_prompts,
                success=simulated_prompts,
                failed=0,
                avg_ttft_s=round(sim_ttft, 3),
                avg_tok_per_sec=round(sim_tps, 2),
                avg_latency_s=round(sim_latency, 3),
                peak_memory_mb=sim_memory,
            )
        )

    best = pick_winner(summaries)
    rendered = build_comparison_table(summaries)

    report = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "prompts_count": 10,
        "mode": "demo",
        "winner": best.config_name,
        "winner_tok_per_sec": best.avg_tok_per_sec,
        "configs": [summary.to_dict() for summary in summaries],
        "table": rendered,
    }

    print(f"\n{rendered}")
    print(f"\nWinner: {best.config_name} ({best.avg_tok_per_sec:.2f} tok/s)")

    if not output_file:
        return report

    target_dir = os.path.dirname(output_file) or "."
    os.makedirs(target_dir, exist_ok=True)
    with open(output_file, "w") as sink:
        sink.write(json.dumps(report, indent=2))
    print(f"Report saved to {output_file}")
    return report
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point for the multi-config benchmark comparison.

    With ``--demo`` a synthetic report is generated and nothing else runs.
    Otherwise configurations come from the ``--config`` JSON file (each item
    is expanded into a ConfigEntry) or from DEFAULT_CONFIGS, prompts come
    from ``--prompts``, and the full comparison is executed.
    """
    parser = argparse.ArgumentParser(
        description="TurboQuant multi-config benchmark comparison")
    parser.add_argument("--config", type=str,
                        help="JSON file with custom configurations")
    parser.add_argument("--prompts", type=str,
                        default="benchmarks/test_prompts.json",
                        help="Path to test prompts JSON")
    parser.add_argument("--output", type=str, default=None,
                        help="Output file for JSON report")
    parser.add_argument("--timeout", type=int, default=120,
                        help="Timeout per prompt in seconds")
    parser.add_argument("--demo", action="store_true",
                        help="Run with synthetic data (no servers)")
    args = parser.parse_args()

    if args.demo:
        run_demo(args.output)
        return

    # Load configs
    if args.config:
        # Decode as UTF-8 explicitly: the platform default encoding is
        # locale-dependent and can mis-decode non-ASCII text in the file.
        with open(args.config, encoding="utf-8") as f:
            raw = json.load(f)
        configs = [ConfigEntry(**c) for c in raw]
    else:
        configs = DEFAULT_CONFIGS

    # Load prompts
    prompts = load_prompts(args.prompts)
    run_comparison(configs, prompts, args.output, args.timeout)


if __name__ == "__main__":
    main()

View File

@@ -5,8 +5,16 @@ TurboQuant Benchmarking Suite — Multi-Backend (Issue #29)
Supports Ollama and llama-server backends with KV cache type configuration.
Measures: TTFT, tokens/sec, latency, peak memory.
IMPORTANT — Perplexity Limitation (Issue #63):
Ollama does NOT expose token logprobs. This means:
- True perplexity (PPL) cannot be measured via the Ollama backend
- The metrics here (tok/s, latency) are throughput proxies, not quality gates
- For real perplexity measurement, use benchmarks/run_perplexity.py
which calls llama-perplexity directly (--logprobs support)
- The pass criterion "PPL delta <= 0.5" cannot be validated via Ollama
Usage:
# Ollama (default)
# Ollama (default) — throughput benchmarks only, NOT perplexity
python3 benchmarks/run_benchmarks.py --backend ollama --model llama3
# llama-server with turbo4 KV

View File

@@ -1,164 +0,0 @@
#!/usr/bin/env python3
"""
Tests for benchmark comparison module (Issue #29).
Covers: ConfigEntry, ConfigResult, aggregation, comparison table,
demo mode, and config loading.
"""
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "benchmarks"))
from compare_configs import (
ConfigEntry,
ConfigResult,
DEFAULT_CONFIGS,
aggregate,
build_comparison_table,
load_prompts,
pick_winner,
run_demo,
)
class TestConfigEntry(unittest.TestCase):
    """ConfigEntry defaults and dictionary conversion."""

    def test_default_values(self):
        """Optional fields fall back to f16 / not layer-adaptive."""
        entry = ConfigEntry(
            name="test",
            backend="ollama",
            model="gemma4",
            url="http://x",
        )
        self.assertEqual(entry.kv_type, "f16")
        self.assertFalse(entry.layer_adaptive)

    def test_to_dict(self):
        """Explicitly set fields show up in the dict form."""
        entry = ConfigEntry(
            name="test",
            backend="llama-server",
            model="g",
            url="http://x",
            kv_type="turbo4",
            layer_adaptive=True,
        )
        as_dict = entry.to_dict()
        self.assertEqual(as_dict["kv_type"], "turbo4")
        self.assertTrue(as_dict["layer_adaptive"])
class TestDefaultConfigs(unittest.TestCase):
    """Shape and content of the built-in configuration list."""

    def test_four_configs(self):
        """Exactly four default configurations are defined."""
        self.assertEqual(len(DEFAULT_CONFIGS), 4)

    def test_names(self):
        """All expected configuration names are present."""
        present = [cfg.name for cfg in DEFAULT_CONFIGS]
        self.assertIn("ollama-gemma4", present)
        self.assertIn("llama-f16", present)
        self.assertIn("llama-turbo4", present)
        self.assertIn("llama-turbo4-adaptive", present)

    def test_turbo4_adaptive_has_flag(self):
        """The adaptive config is turbo4 with layer_adaptive switched on."""
        matches = (cfg for cfg in DEFAULT_CONFIGS
                   if cfg.name == "llama-turbo4-adaptive")
        adaptive = next(matches)
        self.assertTrue(adaptive.layer_adaptive)
        self.assertEqual(adaptive.kv_type, "turbo4")
class TestAggregate(unittest.TestCase):
    """Aggregation of per-prompt results into a ConfigResult."""

    def _make_results(self, n_success: int, n_fail: int) -> list[dict]:
        """Build n_success successful results followed by n_fail failed ones."""
        succeeded = [
            {
                "status": "success",
                "ttft_s": 0.5 + i * 0.1,
                "tokens_per_sec": 20.0 + i * 0.5,
                "latency_s": 1.0 + i * 0.05,
            }
            for i in range(n_success)
        ]
        failed = [{"status": "failed", "latency_s": 0.5} for _ in range(n_fail)]
        return succeeded + failed

    def test_basic_aggregate(self):
        """Counts, memory and throughput are filled in for a mixed run."""
        cfg = ConfigEntry(name="test", backend="ollama", model="m", url="http://x")
        summary = aggregate(self._make_results(5, 1), cfg, peak_mb=100.0)
        self.assertEqual(summary.success, 5)
        self.assertEqual(summary.failed, 1)
        self.assertEqual(summary.total_prompts, 6)
        self.assertAlmostEqual(summary.peak_memory_mb, 100.0)
        self.assertGreater(summary.avg_tok_per_sec, 0)

    def test_no_success(self):
        """Without successes, throughput is 0.0 and TTFT is None."""
        cfg = ConfigEntry(name="test", backend="ollama", model="m", url="http://x")
        only_failures = [{"status": "failed", "latency_s": 0.1}]
        summary = aggregate(only_failures, cfg, peak_mb=0.0)
        self.assertEqual(summary.avg_tok_per_sec, 0.0)
        self.assertIsNone(summary.avg_ttft_s)
class TestPickWinner(unittest.TestCase):
    """Winner selection across aggregated results."""

    @staticmethod
    def _result(name, ok, bad, ttft, tps, latency, mem):
        """Build a five-prompt ConfigResult with the given metrics."""
        return ConfigResult(
            config_name=name,
            backend="o",
            model="m",
            kv_type="f",
            total_prompts=5,
            success=ok,
            failed=bad,
            avg_ttft_s=ttft,
            avg_tok_per_sec=tps,
            avg_latency_s=latency,
            peak_memory_mb=mem,
        )

    def test_highest_tps_wins(self):
        """The result with the highest tokens/sec is returned and flagged."""
        candidates = [
            self._result("slow", 5, 0, 1.0, 10.0, 2.0, 100),
            self._result("fast", 5, 0, 0.5, 25.0, 1.5, 100),
        ]
        chosen = pick_winner(candidates)
        self.assertEqual(chosen.config_name, "fast")
        self.assertTrue(chosen.winner)

    def test_no_success_returns_first(self):
        """With no successful config, the first result is handed back."""
        candidates = [self._result("dead", 0, 5, None, 0.0, 0.0, 0)]
        chosen = pick_winner(candidates)
        self.assertEqual(chosen.config_name, "dead")
class TestComparisonTable(unittest.TestCase):
    """Rendering of the comparison table."""

    def test_table_has_headers(self):
        """The table shows column headers and marks the winning row."""
        configs = [
            ConfigResult(config_name="test-cfg", backend="o", model="m", kv_type="f",
                         total_prompts=5, success=5, failed=0, avg_ttft_s=0.5,
                         avg_tok_per_sec=20.0, avg_latency_s=1.5, peak_memory_mb=100),
        ]
        # Called only for its side effect of flagging the winner in place;
        # the return value is not needed here.
        pick_winner(configs)
        table = build_comparison_table(configs)
        self.assertIn("Config", table)
        self.assertIn("tok/s", table)
        self.assertIn("WINNER", table)
class TestDemoMode(unittest.TestCase):
    """Synthetic demo run, with and without a report file."""

    def test_demo_produces_report(self):
        """The demo report is returned and also written as JSON."""
        with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as handle:
            report_path = Path(handle.name)
        # Remove the temporary file however the test ends.
        self.addCleanup(report_path.unlink, missing_ok=True)

        report = run_demo(str(report_path))
        self.assertEqual(report["mode"], "demo")
        self.assertEqual(report["prompts_count"], 10)
        self.assertEqual(len(report["configs"]), 4)
        self.assertTrue(report_path.exists())

        on_disk = json.loads(report_path.read_text())
        self.assertIn("winner", on_disk)

    def test_demo_without_output(self):
        """Without an output path the report is still returned."""
        report = run_demo()
        self.assertIn("winner", report)
        self.assertGreater(report["winner_tok_per_sec"], 0)
class TestLoadPrompts(unittest.TestCase):
    """Loading of the bundled prompt fixture."""

    def test_load_test_prompts(self):
        """Every record in benchmarks/test_prompts.json has a "prompt" key."""
        prompts_file = Path(__file__).resolve().parent.parent / "benchmarks" / "test_prompts.json"
        if not prompts_file.exists():
            # Report a skip rather than a silent pass, so a missing fixture
            # is visible in the test output.
            self.skipTest(f"prompt fixture not found: {prompts_file}")
        prompts = load_prompts(str(prompts_file))
        self.assertGreater(len(prompts), 0)
        for p in prompts:
            self.assertIn("prompt", p)


if __name__ == "__main__":
    unittest.main()