Compare commits


3 Commits

Author SHA1 Message Date
36b2b07fcc Merge pull request 'feat: Auto-start llama.cpp server for tool call regression tests (#118)' (#151) from fix/118-auto-start-server-fixture into main
All checks were successful
Smoke Test / smoke (push) Successful in 12s
2026-05-05 12:55:25 +00:00
9ed8cd3cae feat: add auto-start server fixture (#118)
All checks were successful
Smoke Test / smoke (pull_request) Successful in 21s
- turboquant_server_url fixture: auto-starts llama-server if no URL provided
- Finds binary in standard locations or PATH
- Finds GGUF model in standard locations
- Configurable via env vars (port, kv_type, ctx_size, timeout)
- Skips gracefully if binary or model not found
- turboquant_model_name fixture for model discovery
2026-04-21 11:52:26 +00:00
82ab8b22c3 feat: add server manager for auto-start fixture (#118) 2026-04-21 11:51:22 +00:00
4 changed files with 280 additions and 497 deletions

benchmarks/compare_configs.py
View File

@@ -1,332 +0,0 @@
#!/usr/bin/env python3
"""
TurboQuant Benchmark Comparison (Issue #29).
Runs multiple inference configurations and produces a side-by-side
comparison table with TTFT, tokens/sec, and peak memory.
Configurations (default):
1. Ollama gemma4 (baseline)
2. llama-server gemma4 f16 KV
3. llama-server gemma4 turbo4 KV
4. llama-server gemma4 turbo4 + layer-adaptive
Usage:
python3 benchmarks/compare_configs.py --help
python3 benchmarks/compare_configs.py --config benchmarks/configs.json
python3 benchmarks/compare_configs.py --demo
"""
import argparse
import json
import os
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Ensure we can import sibling run_benchmarks
sys.path.insert(0, str(Path(__file__).resolve().parent))
try:
from run_benchmarks import (
run_ollama,
run_llama_server,
get_peak_memory_mb,
)
except ImportError:
# Fallback stubs when run_benchmarks (and requests) are unavailable
def run_ollama(prompt, model, url, timeout=120): # type: ignore
return {"status": "skipped", "error": "run_benchmarks not available", "latency_s": 0}
def run_llama_server(prompt, model, url, kv_type="f16", timeout=120): # type: ignore
return {"status": "skipped", "error": "run_benchmarks not available", "latency_s": 0}
def get_peak_memory_mb(): # type: ignore
return 0.0
# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------
@dataclass
class ConfigEntry:
"""One inference configuration to benchmark."""
name: str
backend: str # "ollama" | "llama-server"
model: str
url: str
kv_type: str = "f16"
layer_adaptive: bool = False
env: dict = field(default_factory=dict)
def to_dict(self) -> dict:
return asdict(self)
@dataclass
class ConfigResult:
"""Aggregated results for a single configuration."""
config_name: str
backend: str
model: str
kv_type: str
total_prompts: int
success: int
failed: int
avg_ttft_s: Optional[float]
avg_tok_per_sec: float
avg_latency_s: float
peak_memory_mb: float
winner: bool = False
def to_dict(self) -> dict:
return asdict(self)
# ---------------------------------------------------------------------------
# Default configurations
# ---------------------------------------------------------------------------
DEFAULT_CONFIGS: list[ConfigEntry] = [
ConfigEntry(name="ollama-gemma4", backend="ollama", model="gemma4",
url="http://localhost:11434", kv_type="default"),
ConfigEntry(name="llama-f16", backend="llama-server", model="gemma4",
url="http://localhost:8081", kv_type="f16"),
ConfigEntry(name="llama-turbo4", backend="llama-server", model="gemma4",
url="http://localhost:8081", kv_type="turbo4"),
ConfigEntry(name="llama-turbo4-adaptive", backend="llama-server",
model="gemma4", url="http://localhost:8081",
kv_type="turbo4", layer_adaptive=True),
]
# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------
def load_prompts(prompts_file: str) -> list[dict]:
"""Load test prompts from JSON file."""
with open(prompts_file) as f:
return json.load(f)
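# The prompts file is a JSON list of objects, each with a "prompt" field and an
# optional "id" or "category" (illustrative shape only, not the shipped
# benchmarks/test_prompts.json):
#
#   [
#     {"id": "short-qa", "prompt": "What is KV-cache quantization?"},
#     {"category": "long-context", "prompt": "Summarize the following document: ..."}
#   ]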
def run_config(config: ConfigEntry, prompts: list[dict], timeout: int = 120) -> list[dict]:
"""Run all prompts against a single configuration, return per-prompt results."""
results = []
env_overrides = {**os.environ, **config.env}
if config.layer_adaptive:
env_overrides.setdefault("TURBO_LAYER_ADAPTIVE", "7")
for item in prompts:
if config.backend == "ollama":
result = run_ollama(item["prompt"], config.model, config.url, timeout)
else:
result = run_llama_server(item["prompt"], config.model, config.url,
kv_type=config.kv_type, timeout=timeout)
result["id"] = item.get("id", item.get("category", "unknown"))
result["prompt_preview"] = item["prompt"][:120]
results.append(result)
return results
def aggregate(results: list[dict], config: ConfigEntry, peak_mb: float) -> ConfigResult:
"""Aggregate per-prompt results into a ConfigResult."""
successes = [r for r in results if r.get("status") == "success"]
ttfts = [r["ttft_s"] for r in successes if r.get("ttft_s") is not None]
tps = [r["tokens_per_sec"] for r in successes if r.get("tokens_per_sec")]
lats = [r["latency_s"] for r in successes]
return ConfigResult(
config_name=config.name,
backend=config.backend,
model=config.model,
kv_type=config.kv_type,
total_prompts=len(results),
success=len(successes),
failed=len(results) - len(successes),
avg_ttft_s=round(sum(ttfts) / len(ttfts), 3) if ttfts else None,
avg_tok_per_sec=round(sum(tps) / len(tps), 2) if tps else 0.0,
avg_latency_s=round(sum(lats) / len(lats), 3) if lats else 0.0,
peak_memory_mb=peak_mb,
)
def build_comparison_table(aggregated: list[ConfigResult]) -> str:
"""Build a human-readable comparison table."""
lines = []
header = f"{'Config':<28} {'TTFT':<8} {'tok/s':<10} {'lat(s)':<8} {'mem(MB)':<9} {'ok/n':<6}"
lines.append(header)
lines.append("-" * len(header))
for r in aggregated:
marker = " <- WINNER" if r.winner else ""
ttft = f"{r.avg_ttft_s:.3f}" if r.avg_ttft_s is not None else "N/A"
lines.append(
f"{r.config_name:<28} {ttft:<8} {r.avg_tok_per_sec:<10.2f} "
f"{r.avg_latency_s:<8.3f} {r.peak_memory_mb:<9.1f} "
f"{r.success}/{r.total_prompts}{marker}"
)
return "\n".join(lines)
def pick_winner(aggregated: list[ConfigResult]) -> ConfigResult:
"""Choose the winner: highest tokens/sec among successful configs."""
candidates = [r for r in aggregated if r.success > 0]
if not candidates:
return aggregated[0] if aggregated else ConfigResult(
config_name="none", backend="", model="", kv_type="",
total_prompts=0, success=0, failed=0,
avg_ttft_s=None, avg_tok_per_sec=0.0, avg_latency_s=0.0,
peak_memory_mb=0.0,
)
winner = max(candidates, key=lambda r: r.avg_tok_per_sec)
winner.winner = True
return winner
def run_comparison(configs: list[ConfigEntry], prompts: list[dict],
output_file: Optional[str] = None,
timeout: int = 120) -> dict:
"""Run full comparison and return structured report."""
all_results: list[ConfigResult] = []
for cfg in configs:
print(f"\n--- {cfg.name} ({cfg.backend}/{cfg.kv_type}) ---")
per_prompt = run_config(cfg, prompts, timeout)
peak_mb = get_peak_memory_mb()
agg = aggregate(per_prompt, cfg, peak_mb)
all_results.append(agg)
winner = pick_winner(all_results)
table = build_comparison_table(all_results)
report = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"prompts_count": len(prompts),
"winner": winner.config_name,
"winner_tok_per_sec": winner.avg_tok_per_sec,
"configs": [r.to_dict() for r in all_results],
"table": table,
}
print(f"\n{table}")
print(f"\nWinner: {winner.config_name} ({winner.avg_tok_per_sec:.2f} tok/s)")
if output_file:
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
with open(output_file, "w") as f:
json.dump(report, f, indent=2)
print(f"Report saved to {output_file}")
return report
# ---------------------------------------------------------------------------
# Demo mode (no live servers required)
# ---------------------------------------------------------------------------
def run_demo(output_file: Optional[str] = None) -> dict:
"""Generate synthetic benchmark results for testing."""
import random
random.seed(42)
# Simulated performance baselines
baselines = {
"ollama-gemma4": {"ttft": 0.85, "tps": 18.2, "mem": 2200},
"llama-f16": {"ttft": 0.72, "tps": 22.1, "mem": 2400},
"llama-turbo4": {"ttft": 0.68, "tps": 19.8, "mem": 850},
"llama-turbo4-adaptive": {"ttft": 0.65, "tps": 20.5, "mem": 820},
}
all_results: list[ConfigResult] = []
for cfg in DEFAULT_CONFIGS:
bl = baselines[cfg.name]
prompt_count = 10
ttft = bl["ttft"] + random.gauss(0, 0.02)
tps = bl["tps"] + random.gauss(0, 0.5)
lat = (ttft + 512 / tps) + random.gauss(0, 0.1)
agg = ConfigResult(
config_name=cfg.name,
backend=cfg.backend,
model=cfg.model,
kv_type=cfg.kv_type,
total_prompts=prompt_count,
success=prompt_count,
failed=0,
avg_ttft_s=round(ttft, 3),
avg_tok_per_sec=round(tps, 2),
avg_latency_s=round(lat, 3),
peak_memory_mb=bl["mem"] + random.gauss(0, 50),
)
all_results.append(agg)
winner = pick_winner(all_results)
table = build_comparison_table(all_results)
report = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"prompts_count": 10,
"mode": "demo",
"winner": winner.config_name,
"winner_tok_per_sec": winner.avg_tok_per_sec,
"configs": [r.to_dict() for r in all_results],
"table": table,
}
print(f"\n{table}")
print(f"\nWinner: {winner.config_name} ({winner.avg_tok_per_sec:.2f} tok/s)")
if output_file:
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
with open(output_file, "w") as f:
json.dump(report, f, indent=2)
print(f"Report saved to {output_file}")
return report
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(
description="TurboQuant multi-config benchmark comparison")
parser.add_argument("--config", type=str,
help="JSON file with custom configurations")
parser.add_argument("--prompts", type=str,
default="benchmarks/test_prompts.json",
help="Path to test prompts JSON")
parser.add_argument("--output", type=str, default=None,
help="Output file for JSON report")
parser.add_argument("--timeout", type=int, default=120,
help="Timeout per prompt in seconds")
parser.add_argument("--demo", action="store_true",
help="Run with synthetic data (no servers)")
args = parser.parse_args()
if args.demo:
run_demo(args.output)
return
# Load configs
if args.config:
with open(args.config) as f:
raw = json.load(f)
configs = [ConfigEntry(**c) for c in raw]
else:
configs = DEFAULT_CONFIGS
# Load prompts
prompts = load_prompts(args.prompts)
run_comparison(configs, prompts, args.output, args.timeout)
if __name__ == "__main__":
main()

tests/conftest.py
View File

@@ -1,3 +1,85 @@
"""Pytest configuration for turboquant."""
import sys, os
import os
import sys
import pytest
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
@pytest.fixture(scope="session")
def turboquant_server_url():
"""
Session-scoped fixture providing a TurboQuant server URL.
If TURBOQUANT_SERVER_URL is set, uses that directly.
Otherwise, auto-starts a llama-server with TurboQuant flags.
Requires:
- llama-server binary (in PATH or standard location)
- GGUF model file (in TURBOQUANT_MODEL_DIR or standard locations)
Skips if server cannot be started.
"""
# If URL already provided, use it
if os.environ.get("TURBOQUANT_SERVER_URL"):
yield os.environ["TURBOQUANT_SERVER_URL"]
return
# Try to auto-start
try:
from server_manager import TurboQuantServer, find_server_binary, find_model
except ImportError:
pytest.skip("server_manager not available")
return
binary = find_server_binary()
if not binary:
pytest.skip("llama-server binary not found — install llama-cpp-turboquant")
return
model = find_model()
if not model:
pytest.skip("No GGUF model found — set TURBOQUANT_MODEL_DIR or place model in ~/models")
return
port = int(os.environ.get("TURBOQUANT_TEST_PORT", "18081"))
kv_type = os.environ.get("TURBOQUANT_KV_TYPE", "turbo4")
ctx_size = int(os.environ.get("TURBOQUANT_CTX_SIZE", "8192"))
timeout = float(os.environ.get("TURBOQUANT_STARTUP_TIMEOUT", "60"))
server = TurboQuantServer(
model_path=model,
port=port,
kv_type=kv_type,
context_size=ctx_size,
server_binary=binary,
timeout=timeout,
)
try:
url = server.start()
yield url
except Exception as e:
pytest.skip(f"Could not start TurboQuant server: {e}")
finally:
server.stop()
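# Illustrative use of the fixture from a test module (hypothetical test, not part
# of this change): any test that requests turboquant_server_url gets a live
# OpenAI-compatible endpoint, or is skipped when no server can be started.
#
#   def test_models_endpoint(turboquant_server_url):
#       import json, urllib.request
#       with urllib.request.urlopen(f"{turboquant_server_url}/v1/models", timeout=10) as resp:
#           assert json.loads(resp.read()).get("data")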
@pytest.fixture(scope="session")
def turboquant_model_name(turboquant_server_url):
"""Get the model name from the running server."""
import json
import urllib.request
try:
req = urllib.request.Request(f"{turboquant_server_url}/v1/models")
resp = urllib.request.urlopen(req, timeout=10)
data = json.loads(resp.read())
models = data.get("data", [])
if models:
return models[0].get("id", "unknown")
except Exception:
pass
return "gemma-4"

197
tests/server_manager.py Normal file
View File

@@ -0,0 +1,197 @@
#!/usr/bin/env python3
"""
TurboQuant Server Manager
Manages llama-server lifecycle for integration tests:
- Start server with TurboQuant flags
- Wait for health check
- Stop server on teardown
Usage:
from tests.server_manager import TurboQuantServer
with TurboQuantServer(model_path="/path/to/model.gguf") as server:
url = server.url # e.g. http://localhost:8081
# Run tests against server
"""
import json
import os
import signal
import subprocess
import sys
import time
import urllib.request
import urllib.error
from pathlib import Path
from typing import Optional
class TurboQuantServer:
"""Context manager for llama-server with TurboQuant."""
def __init__(
self,
model_path: str,
port: int = 8081,
kv_type: str = "turbo4",
context_size: int = 32768,
server_binary: Optional[str] = None,
timeout: float = 60.0,
host: str = "127.0.0.1",
):
self.model_path = model_path
self.port = port
self.kv_type = kv_type
self.context_size = context_size
self.timeout = timeout
self.host = host
# Find server binary
if server_binary:
self.server_binary = server_binary
else:
# Try common locations
candidates = [
Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
Path("llama-server"), # PATH
]
self.server_binary = None
for c in candidates:
if c.exists() or c.name == "llama-server":
try:
subprocess.run([str(c), "--help"], capture_output=True, timeout=5)
self.server_binary = str(c)
break
except (FileNotFoundError, subprocess.TimeoutExpired):
continue
self.process: Optional[subprocess.Popen] = None
@property
def url(self) -> str:
return f"http://{self.host}:{self.port}"
def _build_command(self) -> list:
cmd = [
self.server_binary,
"-m", self.model_path,
"--port", str(self.port),
"--host", self.host,
"-ctk", self.kv_type,
"-ctv", self.kv_type,
"-c", str(self.context_size),
]
return cmd
def _check_health(self) -> bool:
try:
req = urllib.request.Request(f"{self.url}/v1/models")
resp = urllib.request.urlopen(req, timeout=5)
data = json.loads(resp.read())
return "data" in data and len(data.get("data", [])) > 0
except Exception:
return False
def start(self) -> str:
"""Start the server and wait for it to be healthy. Returns the server URL."""
if not self.server_binary:
raise RuntimeError(
"llama-server binary not found. Set server_binary or install to standard location."
)
if not Path(self.model_path).exists():
raise FileNotFoundError(f"Model not found: {self.model_path}")
cmd = self._build_command()
# Set TurboQuant env
env = os.environ.copy()
env["TURBO_LAYER_ADAPTIVE"] = "7"
self.process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env,
)
# Wait for health
start = time.time()
while time.time() - start < self.timeout:
if self.process.poll() is not None:
stderr = self.process.stderr.read().decode() if self.process.stderr else ""
raise RuntimeError(f"Server exited early (code {self.process.returncode}): {stderr[:500]}")
if self._check_health():
return self.url
time.sleep(1.0)
self.stop()
raise TimeoutError(f"Server did not become healthy within {self.timeout}s")
def stop(self):
"""Stop the server."""
if self.process:
try:
self.process.send_signal(signal.SIGTERM)
self.process.wait(timeout=10)
except subprocess.TimeoutExpired:
self.process.kill()
self.process.wait(timeout=5)
except Exception:
pass
self.process = None
def __enter__(self) -> "TurboQuantServer":
self.start()
return self
def __exit__(self, *args):
self.stop()
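# For reference, _build_command() above yields an invocation of the form
# (paths and values illustrative):
#   llama-server -m /path/to/model.gguf --port 8081 --host 127.0.0.1 \
#       -ctk turbo4 -ctv turbo4 -c 32768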
def find_server_binary() -> Optional[str]:
"""Find llama-server binary in common locations."""
candidates = [
Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
]
for c in candidates:
if c.exists():
return str(c)
# Try PATH
try:
result = subprocess.run(["which", "llama-server"], capture_output=True, text=True)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def find_model(model_dir: Optional[str] = None) -> Optional[str]:
"""Find a GGUF model file."""
search_dirs = [
model_dir,
os.environ.get("TURBOQUANT_MODEL_DIR"),
str(Path.home() / "models"),
"/opt/models",
"/tmp/models",
]
for d in search_dirs:
if not d:
continue
p = Path(d)
if p.is_file() and p.suffix == ".gguf":
return str(p)
if p.is_dir():
for f in sorted(p.rglob("*.gguf")):
return str(f)
return None
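# Minimal standalone sketch combining the discovery helpers with the context
# manager (assumes a llama-server binary and a GGUF model are discoverable via
# the search paths above):
#
#   if __name__ == "__main__":
#       binary, model = find_server_binary(), find_model()
#       if binary and model:
#           with TurboQuantServer(model_path=model, server_binary=binary, port=18081) as srv:
#               print("server up at", srv.url)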

View File

@@ -1,164 +0,0 @@
#!/usr/bin/env python3
"""
Tests for benchmark comparison module (Issue #29).
Covers: ConfigEntry, ConfigResult, aggregation, comparison table,
demo mode, and config loading.
"""
import json
import os
import sys
import tempfile
import unittest
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "benchmarks"))
from compare_configs import (
ConfigEntry,
ConfigResult,
DEFAULT_CONFIGS,
aggregate,
build_comparison_table,
load_prompts,
pick_winner,
run_demo,
)
class TestConfigEntry(unittest.TestCase):
def test_default_values(self):
c = ConfigEntry(name="test", backend="ollama", model="gemma4", url="http://x")
self.assertEqual(c.kv_type, "f16")
self.assertFalse(c.layer_adaptive)
def test_to_dict(self):
c = ConfigEntry(name="test", backend="llama-server", model="g", url="http://x",
kv_type="turbo4", layer_adaptive=True)
d = c.to_dict()
self.assertEqual(d["kv_type"], "turbo4")
self.assertTrue(d["layer_adaptive"])
class TestDefaultConfigs(unittest.TestCase):
def test_four_configs(self):
self.assertEqual(len(DEFAULT_CONFIGS), 4)
def test_names(self):
names = [c.name for c in DEFAULT_CONFIGS]
self.assertIn("ollama-gemma4", names)
self.assertIn("llama-f16", names)
self.assertIn("llama-turbo4", names)
self.assertIn("llama-turbo4-adaptive", names)
def test_turbo4_adaptive_has_flag(self):
cfg = next(c for c in DEFAULT_CONFIGS if c.name == "llama-turbo4-adaptive")
self.assertTrue(cfg.layer_adaptive)
self.assertEqual(cfg.kv_type, "turbo4")
class TestAggregate(unittest.TestCase):
def _make_results(self, n_success: int, n_fail: int) -> list[dict]:
results = []
for i in range(n_success):
results.append({
"status": "success",
"ttft_s": 0.5 + i * 0.1,
"tokens_per_sec": 20.0 + i * 0.5,
"latency_s": 1.0 + i * 0.05,
})
for _ in range(n_fail):
results.append({"status": "failed", "latency_s": 0.5})
return results
def test_basic_aggregate(self):
results = self._make_results(5, 1)
cfg = ConfigEntry(name="test", backend="ollama", model="m", url="http://x")
agg = aggregate(results, cfg, peak_mb=100.0)
self.assertEqual(agg.success, 5)
self.assertEqual(agg.failed, 1)
self.assertEqual(agg.total_prompts, 6)
self.assertAlmostEqual(agg.peak_memory_mb, 100.0)
self.assertGreater(agg.avg_tok_per_sec, 0)
def test_no_success(self):
results = [{"status": "failed", "latency_s": 0.1}]
cfg = ConfigEntry(name="test", backend="ollama", model="m", url="http://x")
agg = aggregate(results, cfg, peak_mb=0.0)
self.assertEqual(agg.avg_tok_per_sec, 0.0)
self.assertIsNone(agg.avg_ttft_s)
class TestPickWinner(unittest.TestCase):
def test_highest_tps_wins(self):
configs = [
ConfigResult(config_name="slow", backend="o", model="m", kv_type="f",
total_prompts=5, success=5, failed=0, avg_ttft_s=1.0,
avg_tok_per_sec=10.0, avg_latency_s=2.0, peak_memory_mb=100),
ConfigResult(config_name="fast", backend="o", model="m", kv_type="f",
total_prompts=5, success=5, failed=0, avg_ttft_s=0.5,
avg_tok_per_sec=25.0, avg_latency_s=1.5, peak_memory_mb=100),
]
w = pick_winner(configs)
self.assertEqual(w.config_name, "fast")
self.assertTrue(w.winner)
def test_no_success_returns_first(self):
configs = [
ConfigResult(config_name="dead", backend="o", model="m", kv_type="f",
total_prompts=5, success=0, failed=5, avg_ttft_s=None,
avg_tok_per_sec=0.0, avg_latency_s=0.0, peak_memory_mb=0),
]
w = pick_winner(configs)
self.assertEqual(w.config_name, "dead")
class TestComparisonTable(unittest.TestCase):
def test_table_has_headers(self):
configs = [
ConfigResult(config_name="test-cfg", backend="o", model="m", kv_type="f",
total_prompts=5, success=5, failed=0, avg_ttft_s=0.5,
avg_tok_per_sec=20.0, avg_latency_s=1.5, peak_memory_mb=100),
]
w = pick_winner(configs)
table = build_comparison_table(configs)
self.assertIn("Config", table)
self.assertIn("tok/s", table)
self.assertIn("WINNER", table)
class TestDemoMode(unittest.TestCase):
def test_demo_produces_report(self):
with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
out_path = Path(f.name)
try:
report = run_demo(str(out_path))
self.assertEqual(report["mode"], "demo")
self.assertEqual(report["prompts_count"], 10)
self.assertEqual(len(report["configs"]), 4)
self.assertTrue(out_path.exists())
saved = json.loads(out_path.read_text())
self.assertIn("winner", saved)
finally:
out_path.unlink(missing_ok=True)
def test_demo_without_output(self):
report = run_demo()
self.assertIn("winner", report)
self.assertGreater(report["winner_tok_per_sec"], 0)
class TestLoadPrompts(unittest.TestCase):
def test_load_test_prompts(self):
prompts_file = Path(__file__).resolve().parent.parent / "benchmarks" / "test_prompts.json"
if prompts_file.exists():
prompts = load_prompts(str(prompts_file))
self.assertGreater(len(prompts), 0)
for p in prompts:
self.assertIn("prompt", p)
if __name__ == "__main__":
unittest.main()