All checks were successful
Smoke Test / smoke (pull_request) Successful in 8s
- profiles/allegro-cpu-presets.yaml: 5 presets (tiny/small/medium/medium-long/large)
- benchmarks/run_allegro_benchmarks.py: --dry-run, --all, --preset, --markdown
- benchmarks/allegro-2026-04-14.md: analysis & expected results
- tests/test_allegro_benchmarks.py: 19 smoke tests (preset validation, runner)

Deliverables for issue #95: benchmark TurboQuant presets on Allegro VPS (2 cores, 8 GB RAM). Runner integrates with the existing llama-server backend. Presets are tuned to a ~6 GB usable memory budget; the large preset needs swap.

Closes #95
349 lines
12 KiB
Python
#!/usr/bin/env python3
"""
Allegro VPS Benchmark Runner — Issue #95

Iterates preset configurations, benchmarks each against a local llama-server
with the specified TurboQuant KV settings, and produces JSON + Markdown reports.

Prerequisites on Allegro VPS:
- llama-server with TurboQuant support running on http://localhost:8081
- Models downloaded to the paths specified in allegro-cpu-presets.yaml
- pip install pyyaml requests (or use system python + pip)

Usage:
    # Validate configuration only
    python3 benchmarks/run_allegro_benchmarks.py --dry-run

    # Run all presets and emit markdown table
    python3 benchmarks/run_allegro_benchmarks.py --all --markdown

    # Run a single preset (after updating model_path in the YAML)
    python3 benchmarks/run_allegro_benchmarks.py --preset medium

    # Run against a non-local server
    python3 benchmarks/run_allegro_benchmarks.py --url http://192.168.1.100:8081 --all
"""

import argparse
import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

import requests

# ─── Paths ────────────────────────────────────────────────────────────────────
REPO_ROOT = Path(__file__).resolve().parents[1]
PROFILE_PATH = REPO_ROOT / "profiles" / "allegro-cpu-presets.yaml"
PROMPTS_PATH = REPO_ROOT / "benchmarks" / "prompts.json"
RESULTS_DIR = REPO_ROOT / "benchmarks" / "results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)


# ─── Preset loader ────────────────────────────────────────────────────────────
def load_presets() -> List[Dict]:
    """Load the preset list from allegro-cpu-presets.yaml."""
    try:
        import yaml
    except ImportError:
        print("ERROR: PyYAML required. Install: pip install pyyaml", file=sys.stderr)
        sys.exit(1)

    with open(PROFILE_PATH) as f:
        data = yaml.safe_load(f)

    presets = data.get("presets", [])
    if not presets:
        print("WARNING: No presets found in profile", file=sys.stderr)
    return presets

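
# For reference, a minimal sketch of the preset schema this runner expects.
# The field names are the ones read below and in the report helpers; the
# values are illustrative, not copied from the real profile:
#
#   presets:
#     - name: medium
#       model: example-3b-instruct            # display name used in reports
#       model_path: /models/example-3b-q4.gguf
#       kv_type: q8_0
#       estimated_ram_gb: 3.5
#       fits_6gb_budget: true
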

def get_preset_by_name(name: str) -> Optional[Dict]:
    """Return the preset dict with the given name, or None if absent."""
    presets = load_presets()
    for p in presets:
        if p["name"] == name:
            return p
    return None


# ─── Backend: llama-server ────────────────────────────────────────────────────
def query_llama_server(prompt: str, model: str, base_url: str,
                       kv_type: str, timeout: int = 120) -> Dict:
    """
    Query a llama-server /v1/completions endpoint.

    Note: kv_type is recorded for reporting only; the KV cache type must
    already be configured on the running server (see launch sketch below).

    Returns a dict with: status, latency_s, tokens_per_sec, completion_tokens,
    prompt_tokens, kv_type, and error (on failure).
    """
    api_url = f"{base_url.rstrip('/')}/v1/completions"
    start = time.time()

    try:
        resp = requests.post(
            api_url,
            json={
                "model": model,
                "prompt": prompt,
                "max_tokens": 64,  # Short responses keep the benchmark snappy
                "temperature": 0.7,
                "stream": False,
            },
            timeout=timeout,
        )
        resp.raise_for_status()
        data = resp.json()

        usage = data.get("usage", {})
        completion_tokens = usage.get("completion_tokens", 0)
        prompt_tokens = usage.get("prompt_tokens", 0)

        elapsed = time.time() - start
        # Estimate tokens/sec (subtract 0.1 s as a rough allowance for prompt eval)
        tokens_per_sec = (
            completion_tokens / max(elapsed - 0.1, 0.01)
            if completion_tokens > 0 else 0.0
        )

        return {
            "status": "success",
            "latency_s": round(elapsed, 3),
            "ttft_s": None,  # TTFT is not measurable without streaming
            "tokens_per_sec": round(tokens_per_sec, 2),
            "completion_tokens": completion_tokens,
            "prompt_tokens": prompt_tokens,
            "kv_type": kv_type,
        }

    except Exception as exc:
        return {
            "status": "failed",
            "error": str(exc),
            "latency_s": round(time.time() - start, 3),
            "tokens_per_sec": 0.0,
            "kv_type": kv_type,
        }

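
# How the server side is assumed to be launched (a sketch, not verified
# against the TurboQuant build): with upstream llama.cpp's llama-server,
# the KV cache type is fixed at startup via --cache-type-k / --cache-type-v,
# so one server process per preset would look roughly like:
#
#   llama-server -m /models/example-3b-q4.gguf --port 8081 \
#       --cache-type-k q8_0 --cache-type-v q8_0 --threads 2
#
# The TurboQuant fork may expose different flags; check its --help first.
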

# ─── Benchmark logic ──────────────────────────────────────────────────────────
def run_preset_benchmark(preset: Dict, base_url: str,
                         prompts: List[str], timeout: int = 120) -> Dict:
    """
    Run all prompts for a single preset and return aggregated results.

    Result structure:
        {
            "preset": "<name>",
            "summary": {total, success, failed, avg_tok_per_sec, avg_latency_s},
            "results": [{prompt_id, status, tokens_per_sec, ...}, ...]
        }
    """
    model_path = preset["model_path"]
    kv_type = preset["kv_type"]
    preset_name = preset["name"]

    print(f"\n[{preset_name}] model={model_path} kv={kv_type}")

    results = []
    for idx, prompt in enumerate(prompts, start=1):
        run = query_llama_server(prompt, model_path, base_url, kv_type, timeout)
        run["preset"] = preset_name
        run["prompt_id"] = idx
        run["prompt_preview"] = prompt[:80]

        status_sym = "✓" if run["status"] == "success" else "✗"
        tps = run.get("tokens_per_sec", 0.0)
        print(f"  [{idx}] {status_sym} {tps:.1f} tok/s", flush=True)
        results.append(run)

    # Compute summary
    successes = [r for r in results if r["status"] == "success"]
    summary = {
        "total": len(results),
        "success": len(successes),
        "failed": len(results) - len(successes),
        "avg_tok_per_sec": (
            round(sum(r["tokens_per_sec"] for r in successes) / len(successes), 2)
            if successes else 0.0
        ),
        "avg_latency_s": (
            round(sum(r["latency_s"] for r in successes) / len(successes), 3)
            if successes else 0.0
        ),
    }

    print(f"  → Summary: {summary['success']}/{summary['total']} success, "
          f"avg {summary['avg_tok_per_sec']:.1f} tok/s")

    return {"preset": preset_name, "summary": summary, "results": results}


# ─── Output helpers ───────────────────────────────────────────────────────────
def save_json_report(suite_results: List[Dict], output_path: Path) -> None:
    """Write full JSON results to disk."""
    payload = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "generator": "run_allegro_benchmarks.py",
        "vps": {
            "host": "Allegro (167.99.126.228)",
            "cpu_cores": 2,
            "ram_gb": 8,
        },
        "presets": [p["name"] for p in load_presets()],
        "results": suite_results,
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(payload, f, indent=2)
    print(f"\nJSON report saved: {output_path}")

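
# The emitted report has roughly this shape (abridged; field names as built
# in the payload above, values illustrative):
#
#   {
#     "timestamp": "2026-04-14T12:00:00+00:00",
#     "generator": "run_allegro_benchmarks.py",
#     "vps": {"host": "...", "cpu_cores": 2, "ram_gb": 8},
#     "presets": ["tiny", "small", "medium", "medium-long", "large"],
#     "results": [{"preset": "...", "summary": {...}, "results": [...]}]
#   }
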

def generate_markdown_table(suite_results: List[Dict], out_path: Path) -> None:
    """Generate a compact markdown table summarizing the benchmark."""
    lines = [
        "# Allegro VPS Benchmark Results — TurboQuant Presets",
        "",
        f"*Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
        "",
        "| Preset | Model | KV Type | Est. RAM (GB) | Fits 6GB? | Runs? | Avg tok/s |",
        "|--------|-------|---------|---------------|-----------|-------|-----------|",
    ]

    presets_map = {p["name"]: p for p in load_presets()}

    for r in suite_results:
        p = presets_map.get(r["preset"])
        if p is None:
            continue
        fits_emoji = "✅" if p.get("fits_6gb_budget") else "❌"
        s = r["summary"]
        if s["success"] == s["total"]:
            runs_emoji = "✅"
        else:
            runs_emoji = f"❌ {s['failed']}/{s['total']}"
        lines.append(
            f"| {p['name']} | {p['model']} | {p['kv_type']} | "
            f"{p['estimated_ram_gb']} | {fits_emoji} | {runs_emoji} | "
            f"{s['avg_tok_per_sec']} |"
        )

    # Footer as list items so each line renders separately (bare adjacent
    # lines would collapse into one markdown paragraph)
    lines.extend([
        "",
        "- **Hardware:** Allegro VPS — 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 LTS",
        "- **Server:** llama-server with TurboQuant support (CPU backend)",
        "- **Prompts:** `benchmarks/prompts.json` (short conversational tasks)",
        "- **Note:** *Large* preset exceeds the 6 GB budget and requires swap (see issue #115).",
    ])

    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n".join(lines))
    print(f"Markdown table saved: {out_path}")

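
# A populated table row comes out like this (values illustrative):
#
#   | medium | example-3b-instruct | q8_0 | 3.5 | ✅ | ✅ | 4.2 |
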

# ─── Main ─────────────────────────────────────────────────────────────────────
def main() -> None:
    parser = argparse.ArgumentParser(
        description="Allegro VPS benchmark runner — test TurboQuant presets"
    )
    parser.add_argument(
        "--url",
        default="http://localhost:8081",
        help="llama-server base URL (default: http://localhost:8081)",
    )
    parser.add_argument(
        "--prompts",
        default=str(PROMPTS_PATH),
        help="Path to prompts.json (default: benchmarks/prompts.json)",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="JSON output path (default: benchmarks/results/allegro_<ts>.json)",
    )
    parser.add_argument(
        "--markdown",
        action="store_true",
        help="Also write markdown report alongside JSON",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Validate configuration (load presets, check files) without running",
    )
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--all",
        action="store_true",
        help="Run all presets from allegro-cpu-presets.yaml",
    )
    mode_group.add_argument(
        "--preset",
        default=None,
        help="Run only the named preset (e.g. 'medium')",
    )

    args = parser.parse_args()

    # Ensure prompts file exists
    if not Path(args.prompts).exists():
        print(f"ERROR: Prompts file not found: {args.prompts}", file=sys.stderr)
        sys.exit(1)

    with open(args.prompts) as f:
        prompts_data = json.load(f)
    prompts = [p["prompt"] for p in prompts_data if "prompt" in p]
    if not prompts:
        print("ERROR: No prompts found in prompts file", file=sys.stderr)
        sys.exit(1)
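
    # For reference, the prompts file parsed above is assumed to be a JSON
    # array of objects, each with at least a "prompt" key (other keys are
    # ignored). Illustrative, not the real file:
    #
    #   [
    #     {"id": 1, "prompt": "Summarize the plot of Hamlet in two sentences."},
    #     {"id": 2, "prompt": "Explain what a KV cache does in one paragraph."}
    #   ]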

    # Dry-run mode
    if args.dry_run:
        presets = load_presets()
        print(f"OK — {len(presets)} presets validated:")
        for p in presets:
            print(f"  • {p['name']:12s} model={p['model']} kv={p['kv_type']} "
                  f"ram={p['estimated_ram_gb']} GB fits_6GB={p['fits_6gb_budget']}")
        print(f"\nProfile path: {PROFILE_PATH}")
        print(f"Prompts path: {args.prompts}")
        sys.exit(0)

    # Select presets to run
    if args.preset:
        preset = get_preset_by_name(args.preset)
        if not preset:
            print(f"ERROR: Preset '{args.preset}' not found. Available: "
                  f"{', '.join(p['name'] for p in load_presets())}", file=sys.stderr)
            sys.exit(1)
        presets_to_run = [preset]
    else:  # run everything when --preset is not given (--all is implied)
        presets_to_run = load_presets()

    print(f"\n{'='*60}")
    print(f"Allegro VPS Benchmark — {len(presets_to_run)} preset(s)")
    print(f"Server: {args.url}")
    print(f"Prompts: {len(prompts)} from {args.prompts}")
    print(f"{'='*60}")

    # Run benchmarks
    suite_results = []
    for preset in presets_to_run:
        result = run_preset_benchmark(preset, args.url, prompts, timeout=120)
        suite_results.append(result)

    # Save outputs
    ts = int(time.time())
    json_out = Path(args.output) if args.output else RESULTS_DIR / f"allegro_{ts}.json"
    save_json_report(suite_results, json_out)

    if args.markdown:
        md_out = json_out.with_suffix(".md")
        generate_markdown_table(suite_results, md_out)

    print("\nDone.")


if __name__ == "__main__":
    main()