Compare commits: step35/67-...step35/95- (1 commit: cb2f7b0aa7)

benchmarks/allegro-2026-04-14.md (new file, +56 lines)
@@ -0,0 +1,56 @@
# Allegro VPS Benchmark Analysis — TurboQuant Presets

*Generated: 2026-04-26*

> **Hardware:** Allegro VPS — 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 LTS
> **Server:** `llama-server` with TurboQuant KV compression (CPU backend)
> **Scope:** Compare TurboQuant preset configurations for memory vs. throughput trade-offs
## Preset Summary

| Preset | Model | KV Type | Est. RAM (GB) | Fits 6GB? | Target |
|--------|-------|---------|---------------|-----------|--------|
| tiny | 2B Q4 | f16 | 2.8 | ✅ | Baseline |
| small | 3B Q4 | turbo2 | 3.6 | ✅ | Best throughput |
| medium | 7B Q4 | turbo4 | 5.2 | ✅ | **Recommended** (quality within budget) |
| medium-long | 7B Q4 | turbo4_q3_k | 5.8 | ✅ | Extended context |
| large | 14B Q3 | turbo4 | 7.2 | ❌ | Requires swap |
## Expected Results (Estimates)

| Preset | Expected tok/s | Notes |
|--------|----------------|-------|
| tiny | 8–15 | Fast baseline, no KV compression |
| small | 5–10 | 2-bit KV compression, good speed |
| medium | 2–5 | 4-bit KV compression, balanced |
| medium-long | 1.5–4 | Better model quant, longer context |
| large | 0.5–2 | Large model; swap may bottleneck |
> **Recommendation (medium):** Best quality within the 6 GB usable memory budget on Allegro.
> 7B Q4 with turbo4 KV gives ~5.2 GB total; 14B requires swap (issue #115).
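As a sanity check on the ~5.2 GB figure, here is a back-of-envelope KV-cache sizing sketch. The architecture numbers (32 layers, 4096 hidden dimension, 4096-token context) are typical for a 7B-class model and are assumptions for illustration, not measurements from this benchmark:

```python
def kv_cache_gb(n_layers: int, hidden_dim: int, ctx_len: int, bits_per_elem: float) -> float:
    """K and V caches: 2 tensors per layer, each ctx_len x hidden_dim elements."""
    elems = 2 * n_layers * ctx_len * hidden_dim
    return elems * bits_per_elem / 8 / 1e9

# Assumed 7B-class shape: 32 layers, 4096 hidden dim, 4096-token context.
print(f"f16 KV:    {kv_cache_gb(32, 4096, 4096, 16):.2f} GB")  # ~2.15 GB
print(f"turbo4 KV: {kv_cache_gb(32, 4096, 4096, 4):.2f} GB")   # ~0.54 GB
# A ~4.1 GB Q4_K_M model file plus ~0.5 GB of turbo4 KV plus runtime overhead
# lands near the 5.2 GB estimate in the preset table above.
```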
## Running the Benchmarks

```bash
# Validate configuration (does not hit the server)
python3 benchmarks/run_allegro_benchmarks.py --dry-run

# Run all presets and produce both JSON and markdown reports
python3 benchmarks/run_allegro_benchmarks.py --all --markdown

# Run a single preset (after filling in model_path in the YAML)
python3 benchmarks/run_allegro_benchmarks.py --preset medium
```
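The runner keeps only entries in `benchmarks/prompts.json` that carry a `"prompt"` key; any other fields are ignored. A minimal sketch for generating a compatible file (the example prompts and the `id` field are illustrative, not part of the shipped prompt set):

```python
import json
from pathlib import Path

# Each entry needs a "prompt" key; extra keys such as "id" are ignored by the runner.
prompts = [
    {"id": 1, "prompt": "Explain KV-cache quantization in two sentences."},
    {"id": 2, "prompt": "Write a haiku about a busy CPU."},
]
Path("benchmarks/prompts.json").write_text(json.dumps(prompts, indent=2))
```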
## Deliverables

- ✅ `profiles/allegro-cpu-presets.yaml` — preset configurations
- ✅ `benchmarks/run_allegro_benchmarks.py` — runner script
- ✅ `benchmarks/allegro-2026-04-14.md` — this analysis (expected results)
- ✅ `tests/test_allegro_benchmarks.py` — smoke tests for preset loading/validation
## Next Steps

1. Place GGUF model files at the `model_path` locations in `allegro-cpu-presets.yaml`.
2. Ensure llama-server with TurboQuant is running on port 8081 (see the probe below).
3. Run `--all --markdown` and commit the generated `allegro-<timestamp>.md` results.
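For step 2, a minimal reachability probe that mirrors the request `run_allegro_benchmarks.py` sends (a sketch; the model path is a placeholder to fill in):

```python
import requests

# One-token completion against the same endpoint the runner uses.
# Raises a ConnectionError if the server is not up.
resp = requests.post(
    "http://localhost:8081/v1/completions",
    json={"model": "/path/to/7b-q4_k_m.gguf", "prompt": "ping", "max_tokens": 1},
    timeout=10,
)
print(resp.status_code, resp.json().get("usage"))
```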
benchmarks/run_allegro_benchmarks.py (new file, +348 lines)
@@ -0,0 +1,348 @@
#!/usr/bin/env python3
"""
Allegro VPS Benchmark Runner — Issue #95

Iterates over the preset configurations, benchmarks each against a local
llama-server with the specified TurboQuant KV settings, and produces
JSON + Markdown reports.

Prerequisites on Allegro VPS:
- llama-server with TurboQuant support running on http://localhost:8081
- Models downloaded to the paths specified in allegro-cpu-presets.yaml
- pip install pyyaml requests (or use system python + pip)

Usage:
    # Validate configuration only
    python3 benchmarks/run_allegro_benchmarks.py --dry-run

    # Run all presets and emit markdown table
    python3 benchmarks/run_allegro_benchmarks.py --all --markdown

    # Run a single preset (after updating model_path in the YAML)
    python3 benchmarks/run_allegro_benchmarks.py --preset medium

    # Run against a non-local server
    python3 benchmarks/run_allegro_benchmarks.py --url http://192.168.1.100:8081 --all
"""
import argparse
import json
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

import requests
# ─── Paths ────────────────────────────────────────────────────────────────────
REPO_ROOT = Path(__file__).resolve().parents[1]
PROFILE_PATH = REPO_ROOT / "profiles" / "allegro-cpu-presets.yaml"
PROMPTS_PATH = REPO_ROOT / "benchmarks" / "prompts.json"
RESULTS_DIR = REPO_ROOT / "benchmarks" / "results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
# ─── Preset loader ────────────────────────────────────────────────────────────
def load_presets() -> List[Dict]:
    """Load preset list from allegro-cpu-presets.yaml."""
    try:
        import yaml
    except ImportError:
        print("ERROR: PyYAML required. Install: pip install pyyaml", file=sys.stderr)
        sys.exit(1)

    with open(PROFILE_PATH) as f:
        data = yaml.safe_load(f)

    presets = data.get("presets", [])
    if not presets:
        print("WARNING: No presets found in profile", file=sys.stderr)
    return presets
def get_preset_by_name(name: str) -> Optional[Dict]:
    """Return the preset dict with the given name, or None if absent."""
    presets = load_presets()
    for p in presets:
        if p["name"] == name:
            return p
    return None
# ─── Backend: llama-server ────────────────────────────────────────────────────
def query_llama_server(prompt: str, model: str, base_url: str,
                       kv_type: str, timeout: int = 120) -> Dict:
    """
    Query a llama-server /v1/completions endpoint.

    Returns a dict with: status, latency_s, tokens_per_sec, completion_tokens,
    prompt_tokens, kv_type, and error (on failure).
    """
    api_url = f"{base_url.rstrip('/')}/v1/completions"
    start = time.time()

    try:
        resp = requests.post(
            api_url,
            json={
                "model": model,
                "prompt": prompt,
                "max_tokens": 64,  # short responses keep the benchmark snappy
                "temperature": 0.7,
                "stream": False,
            },
            timeout=timeout,
        )
        resp.raise_for_status()
        data = resp.json()

        usage = data.get("usage", {})
        completion_tokens = usage.get("completion_tokens", 0)
        prompt_tokens = usage.get("prompt_tokens", 0)

        elapsed = time.time() - start
        # Estimate tokens/sec (crude 0.1 s allowance for prompt evaluation)
        tokens_per_sec = (
            completion_tokens / max(elapsed - 0.1, 0.01)
            if completion_tokens > 0 else 0.0
        )

        return {
            "status": "success",
            "latency_s": round(elapsed, 3),
            "ttft_s": None,  # TTFT is not measurable without streaming
            "tokens_per_sec": round(tokens_per_sec, 2),
            "completion_tokens": completion_tokens,
            "prompt_tokens": prompt_tokens,
            "kv_type": kv_type,
        }

    except Exception as exc:
        return {
            "status": "failed",
            "error": str(exc),
            "latency_s": round(time.time() - start, 3),
            "tokens_per_sec": 0.0,
            "kv_type": kv_type,
        }
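# Example (illustrative only, not executed by the benchmark): an ad-hoc call,
# assuming llama-server is already up on the default port:
#
#   out = query_llama_server(
#       "Explain KV compression briefly.",
#       model="/path/to/7b-q4_k_m.gguf",
#       base_url="http://localhost:8081",
#       kv_type="turbo4",
#   )
#   out["status"]          # "success" or "failed"
#   out["tokens_per_sec"]  # 0.0 on failure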
# ─── Benchmark logic ──────────────────────────────────────────────────────────
def run_preset_benchmark(preset: Dict, base_url: str,
                         prompts: List[str], timeout: int = 120) -> Dict:
    """
    Run all prompts for a single preset and return aggregated results.

    Result structure:
    {
        "preset": "<name>",
        "summary": {total, success, failed, avg_tok_per_sec, avg_latency_s},
        "results": [{prompt_id, status, tokens_per_sec, ...}, ...]
    }
    """
    model_path = preset["model_path"]
    kv_type = preset["kv_type"]
    preset_name = preset["name"]

    print(f"\n[{preset_name}] model={model_path} kv={kv_type}")

    results = []
    for idx, prompt in enumerate(prompts, start=1):
        run = query_llama_server(prompt, model_path, base_url, kv_type, timeout)
        run["preset"] = preset_name
        run["prompt_id"] = idx
        run["prompt_preview"] = prompt[:80]

        status_sym = "✓" if run["status"] == "success" else "✗"
        tps = run.get("tokens_per_sec", 0.0)
        print(f"  [{idx}] {status_sym} {tps:.1f} tok/s", flush=True)
        results.append(run)

    # Compute summary
    successes = [r for r in results if r["status"] == "success"]
    summary = {
        "total": len(results),
        "success": len(successes),
        "failed": len(results) - len(successes),
        "avg_tok_per_sec": (
            round(sum(r["tokens_per_sec"] for r in successes) / len(successes), 2)
            if successes else 0.0
        ),
        "avg_latency_s": (
            round(sum(r["latency_s"] for r in successes) / len(successes), 3)
            if successes else 0.0
        ),
    }

    print(f"  → Summary: {summary['success']}/{summary['total']} success, "
          f"avg {summary['avg_tok_per_sec']:.1f} tok/s")

    return {"preset": preset_name, "summary": summary, "results": results}
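# Shape of one aggregated entry, with illustrative numbers (three prompts,
# all succeeding):
#
#   {"preset": "medium",
#    "summary": {"total": 3, "success": 3, "failed": 0,
#                "avg_tok_per_sec": 3.4, "avg_latency_s": 19.2},
#    "results": [...]}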
# ─── Output helpers ───────────────────────────────────────────────────────────
def save_json_report(suite_results: List[Dict], output_path: Path) -> None:
    """Write full JSON results to disk."""
    payload = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "generator": "run_allegro_benchmarks.py",
        "vps": {
            "host": "Allegro (167.99.126.228)",
            "cpu_cores": 2,
            "ram_gb": 8,
        },
        "presets": [p["name"] for p in load_presets()],
        "results": suite_results,
    }
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(payload, f, indent=2)
    print(f"\nJSON report saved: {output_path}")
def generate_markdown_table(suite_results: List[Dict], out_path: Path) -> None:
    """Generate a compact markdown table summarizing the benchmark."""
    lines = [
        "# Allegro VPS Benchmark Results — TurboQuant Presets",
        "",
        f"*Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
        "",
        "| Preset | Model | KV Type | Est. RAM (GB) | Fits 6GB? | Runs? | Avg tok/s |",
        "|--------|-------|---------|---------------|-----------|-------|-----------|",
    ]

    presets_map = {p["name"]: p for p in load_presets()}

    for r in suite_results:
        p = presets_map.get(r["preset"])
        if p is None:
            continue
        fits_emoji = "✅" if p.get("fits_6gb_budget") else "❌"
        s = r["summary"]
        if s["success"] == s["total"]:
            runs_emoji = "✅"
        else:
            runs_emoji = f"❌ {s['failed']}/{s['total']}"
        lines.append(
            f"| {p['name']} | {p['model']} | {p['kv_type']} | "
            f"{p['estimated_ram_gb']} | {fits_emoji} | {runs_emoji} | "
            f"{s['avg_tok_per_sec']} |"
        )

    # Blank lines between the footer entries so they render as separate
    # paragraphs rather than one run-on line.
    lines.extend([
        "",
        "**Hardware:** Allegro VPS — 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 LTS",
        "",
        "**Server:** llama-server with TurboQuant KV compression (CPU backend)",
        "",
        "**Prompts:** `benchmarks/prompts.json` (short conversational tasks)",
        "",
        "**Note:** *Large* preset exceeds the 6 GB budget and requires swap (see issue #115).",
    ])

    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("\n".join(lines))
    print(f"Markdown table saved: {out_path}")
# ─── Main ─────────────────────────────────────────────────────────────────────
def main() -> None:
    parser = argparse.ArgumentParser(
        description="Allegro VPS benchmark runner — test TurboQuant presets"
    )
    parser.add_argument(
        "--url",
        default="http://localhost:8081",
        help="llama-server base URL (default: http://localhost:8081)",
    )
    parser.add_argument(
        "--prompts",
        default=str(PROMPTS_PATH),
        help="Path to prompts.json (default: benchmarks/prompts.json)",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="JSON output path (default: benchmarks/results/allegro_<ts>.json)",
    )
    parser.add_argument(
        "--markdown",
        action="store_true",
        help="Also write markdown report alongside JSON",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Validate configuration (load presets, check files) without running",
    )
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--all",
        action="store_true",
        help="Run all presets from allegro-cpu-presets.yaml",
    )
    mode_group.add_argument(
        "--preset",
        default=None,
        help="Run only the named preset (e.g. 'medium')",
    )

    args = parser.parse_args()

    # Ensure prompts file exists
    if not Path(args.prompts).exists():
        print(f"ERROR: Prompts file not found: {args.prompts}", file=sys.stderr)
        sys.exit(1)

    with open(args.prompts) as f:
        prompts_data = json.load(f)
    prompts = [p["prompt"] for p in prompts_data if "prompt" in p]
    if not prompts:
        print("ERROR: No prompts found in prompts file", file=sys.stderr)
        sys.exit(1)

    # Dry-run mode
    if args.dry_run:
        presets = load_presets()
        print(f"OK — {len(presets)} presets validated:")
        for p in presets:
            print(f"  • {p['name']:12s} model={p['model']} kv={p['kv_type']} "
                  f"ram={p['estimated_ram_gb']} GB fits_6GB={p['fits_6gb_budget']}")
        print(f"\nProfile path: {PROFILE_PATH}")
        print(f"Prompts path: {args.prompts}")
        sys.exit(0)

    # Select presets to run
    if args.preset:
        preset = get_preset_by_name(args.preset)
        if not preset:
            print(f"ERROR: Preset '{args.preset}' not found. Available: "
                  f"{', '.join(p['name'] for p in load_presets())}", file=sys.stderr)
            sys.exit(1)
        presets_to_run = [preset]
    else:  # --all, or no mode flag at all, runs every preset
        presets_to_run = load_presets()

    print(f"\n{'='*60}")
    print(f"Allegro VPS Benchmark — {len(presets_to_run)} preset(s)")
    print(f"Server: {args.url}")
    print(f"Prompts: {len(prompts)} from {args.prompts}")
    print(f"{'='*60}")

    # Run benchmarks
    suite_results = []
    for preset in presets_to_run:
        result = run_preset_benchmark(preset, args.url, prompts, timeout=120)
        suite_results.append(result)

    # Save outputs
    ts = int(time.time())
    json_out = Path(args.output) if args.output else RESULTS_DIR / f"allegro_{ts}.json"
    save_json_report(suite_results, json_out)

    if args.markdown:
        md_out = json_out.with_suffix(".md")
        generate_markdown_table(suite_results, md_out)

    print("\nDone.")


if __name__ == "__main__":
    main()
profiles/allegro-cpu-presets.yaml (new file, +75 lines)
@@ -0,0 +1,75 @@
# Allegro VPS TurboQuant Preset Configurations
# Issue: #95 — Benchmark TurboQuant presets on Allegro VPS (2 cores, 8 GB RAM)
#
# Hardware: 2 vCPU cores, 8 GB RAM, Ubuntu 24.04 (VPS)
# Memory budget: ~6 GB usable for model + KV cache after OS/services overhead
#
# Usage:
#   python3 benchmarks/run_allegro_benchmarks.py --all --markdown
#   python3 benchmarks/run_allegro_benchmarks.py --preset medium --dry-run
#
# Preset semantics:
#   name:                  Human-readable preset label
#   model:                 Human-readable model descriptor (for documentation)
#   model_path:            Absolute GGUF path on the VPS (user must provide)
#   kv_type:               TurboQuant KV compression level (turbo4/turbo2/f16/q4_0/etc.)
#   estimated_ram_gb:      Total estimated RAM usage (model + KV + overhead)
#   fits_6gb_budget:       True if estimated RAM fits within the 6 GB memory budget
#   estimated_tok_per_sec: Expected throughput range (tok/s) on a 2-core CPU
#
# Notes:
#   - turbo2: 2-bit (1.5 bits/channel), fastest, lower quality
#   - turbo4: 4-bit (3.5 bits/channel), best quality, slower
#   - f16: no compression, used for baseline comparison
#   - q3_k: Q3_K_M quantization (alternative medium-quality preset)
#
# The VPS needs swap configured for presets marked fits_6gb_budget: false.
# See issue #115 for Allegro swap configuration.
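# Rough KV footprint relative to f16, using the bits/channel figures above
# (illustrative arithmetic only):
#   f16    -> 16 bits/elem   (baseline)
#   turbo4 -> 3.5 bits/chan  (~4.6x smaller KV than f16)
#   turbo2 -> 1.5 bits/chan  (~10.7x smaller KV than f16)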
presets:
  - name: tiny
    model: "2B Q4 (Q4_K_M)"
    model_path: "/path/to/2b-q4_k_m.gguf"  # USER: replace with actual path
    kv_type: "f16"
    estimated_ram_gb: 2.8
    fits_6gb_budget: true
    estimated_tok_per_sec: "8-15"
    description: "Baseline: tiny model, no KV compression"

  - name: small
    model: "3B Q4 (Q4_K_M)"
    model_path: "/path/to/3b-q4_k_m.gguf"
    kv_type: "turbo2"
    estimated_ram_gb: 3.6
    fits_6gb_budget: true
    estimated_tok_per_sec: "5-10"
    description: "Best throughput; 2-bit KV compression"

  - name: medium
    model: "7B Q4 (Q4_K_M)"
    model_path: "/path/to/7b-q4_k_m.gguf"
    kv_type: "turbo4"
    estimated_ram_gb: 5.2
    fits_6gb_budget: true
    estimated_tok_per_sec: "2-5"
    description: "Recommended: best quality within 6 GB budget"

  - name: medium-long
    model: "7B Q4 (Q4_K_M)"
    model_path: "/path/to/7b-q4_k_m.gguf"
    kv_type: "turbo4_q3_k"  # turbo4-level quality, q3_k model quant
    estimated_ram_gb: 5.8
    fits_6gb_budget: true
    estimated_tok_per_sec: "1.5-4"
    description: "Extended context, 7B with better model quantization"

  - name: large
    model: "14B Q3 (Q3_K_M)"
    model_path: "/path/to/14b-q3_k_m.gguf"
    kv_type: "turbo4"
    estimated_ram_gb: 7.2
    fits_6gb_budget: false
    estimated_tok_per_sec: "0.5-2"
    description: "Largest model; requires swap, lowest throughput"

# End of preset configurations — benchmark runner will iterate these.
tests/test_allegro_benchmarks.py (new file, +211 lines)
@@ -0,0 +1,211 @@
#!/usr/bin/env python3
"""
Smoke tests for Allegro VPS benchmark infrastructure — Issue #95

Validates the preset configuration and runner entry points without
actually contacting a llama-server (no network needed).
"""

import os
import sys
from pathlib import Path

import pytest

# Add repo root to sys.path
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT))
# ─── Test fixtures ────────────────────────────────────────────────────────────
PROFILE_PATH = REPO_ROOT / "profiles" / "allegro-cpu-presets.yaml"
BENCHMARK_RUNNER = REPO_ROOT / "benchmarks" / "run_allegro_benchmarks.py"
# ─── Preset configuration validation ─────────────────────────────────────────
class TestAllegroPresets:
    """Validate allegro-cpu-presets.yaml structure and values."""

    def test_profile_file_exists(self):
        assert PROFILE_PATH.exists(), f"Profile not found: {PROFILE_PATH}"

    def test_profile_loads_as_yaml(self):
        import yaml
        with open(PROFILE_PATH) as f:
            data = yaml.safe_load(f)
        assert "presets" in data, "Profile must have a 'presets' key"
        assert isinstance(data["presets"], list), "presets must be a list"
        assert len(data["presets"]) > 0, "presets list cannot be empty"

    def test_each_preset_has_required_fields(self):
        import yaml
        with open(PROFILE_PATH) as f:
            data = yaml.safe_load(f)

        required = {"name", "model", "model_path", "kv_type",
                    "estimated_ram_gb", "fits_6gb_budget",
                    "estimated_tok_per_sec", "description"}

        for p in data["presets"]:
            missing = required - set(p.keys())
            assert not missing, f"Preset '{p.get('name', '?')}' missing fields: {missing}"

    def test_ram_estimates_are_positive(self):
        import yaml
        with open(PROFILE_PATH) as f:
            data = yaml.safe_load(f)

        for p in data["presets"]:
            ram = p["estimated_ram_gb"]
            assert ram > 0, f"{p['name']}: estimated_ram_gb must be positive"

    def test_ram_estimates_reasonable_for_8gb_vps(self):
        """No single preset should exceed the total 8 GB RAM (even with swap)."""
        import yaml
        with open(PROFILE_PATH) as f:
            data = yaml.safe_load(f)

        for p in data["presets"]:
            ram = p["estimated_ram_gb"]
            assert ram < 10, (
                f"{p['name']}: estimated_ram_gb={ram} GB seems too high "
                f"for an 8 GB VPS even with swap"
            )

    def test_kv_type_is_string(self):
        import yaml
        with open(PROFILE_PATH) as f:
            data = yaml.safe_load(f)
        for p in data["presets"]:
            assert isinstance(p["kv_type"], str)
            assert len(p["kv_type"]) > 0

    def test_fits_6gb_budget_is_boolean(self):
        import yaml
        with open(PROFILE_PATH) as f:
            data = yaml.safe_load(f)
        for p in data["presets"]:
            assert isinstance(p["fits_6gb_budget"], bool)

    def test_preset_names_are_unique(self):
        import yaml
        with open(PROFILE_PATH) as f:
            data = yaml.safe_load(f)
        names = [p["name"] for p in data["presets"]]
        assert len(names) == len(set(names)), "Duplicate preset names found"

    def test_expected_preset_names_present(self):
        """Sanity check: the documented 5 presets should exist."""
        import yaml
        with open(PROFILE_PATH) as f:
            data = yaml.safe_load(f)
        names = {p["name"] for p in data["presets"]}
        expected = {"tiny", "small", "medium", "medium-long", "large"}
        assert expected.issubset(names), f"Missing presets: {expected - names}"
# ─── Benchmark runner import sanity ───────────────────────────────────────────
class TestAllegroRunner:
    """Verify run_allegro_benchmarks.py can be imported and exposes the expected API."""

    def test_runner_file_exists(self):
        assert BENCHMARK_RUNNER.exists(), f"Runner not found: {BENCHMARK_RUNNER}"

    def test_runner_is_executable_shebang(self):
        """First line should be a Python shebang."""
        with open(BENCHMARK_RUNNER) as f:
            first = f.readline().strip()
        assert first.startswith("#!"), "Missing shebang"
        assert "python" in first.lower(), "Shebang does not reference python"

    def test_runner_imports_main(self):
        """The runner script should define main() for subprocess invocation."""
        import importlib.util
        spec = importlib.util.spec_from_file_location(
            "run_allegro_benchmarks", BENCHMARK_RUNNER
        )
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)  # type: ignore[attr-defined]
        assert hasattr(mod, "main"), "runner must define a main() function"

    def test_runner_dry_run_invocation(self):
        """Subprocess dry-run should exit 0 and print OK."""
        import subprocess
        env = os.environ.copy()
        # Ensure we use the same python as the test runner
        result = subprocess.run(
            [sys.executable, str(BENCHMARK_RUNNER), "--dry-run"],
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        assert result.returncode == 0, (
            f"dry-run failed (code {result.returncode})\nSTDERR: {result.stderr}"
        )
        assert "OK" in result.stdout, "dry-run did not print 'OK'"
# ─── Markdown report validation ────────────────────────────────────────────────
class TestAllegroMarkdownReport:
    """Validate the Allegro markdown report exists and has expected sections."""

    def test_markdown_report_exists(self):
        md_path = REPO_ROOT / "benchmarks" / "allegro-2026-04-14.md"
        assert md_path.exists(), f"Markdown report not found: {md_path}"

    def test_markdown_contains_presets_table(self):
        md_path = REPO_ROOT / "benchmarks" / "allegro-2026-04-14.md"
        content = md_path.read_text()
        assert "| Preset" in content, "Missing presets table header"
        assert "| tiny" in content, "Missing 'tiny' preset row"
        assert "| medium" in content, "Missing 'medium' preset row"

    def test_markdown_contains_hardware_spec(self):
        md_path = REPO_ROOT / "benchmarks" / "allegro-2026-04-14.md"
        content = md_path.read_text()
        assert "2 vCPU" in content or "2 cores" in content, "Should mention the Allegro VPS core count"
        assert "8 GB" in content, "Should mention the Allegro VPS RAM"

    def test_markdown_contains_recommendation(self):
        md_path = REPO_ROOT / "benchmarks" / "allegro-2026-04-14.md"
        content = md_path.read_text()
        # Some form of recommendation should appear
        assert ("recommend" in content.lower() or
                "best quality" in content.lower()), "Should include a preset recommendation"
# ─── Integration helpers test ─────────────────────────────────────────────────
class TestAllegroHelpers:
    """Lightweight unit tests for helper functions loaded from the runner."""

    def test_load_presets_function_exists(self):
        """The runner exposes load_presets(); verify it returns a list."""
        import importlib.util
        spec = importlib.util.spec_from_file_location(
            "run_allegro_benchmarks", BENCHMARK_RUNNER
        )
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)  # type: ignore[attr-defined]
        presets = mod.load_presets()
        assert isinstance(presets, list)
        assert len(presets) >= 5, f"Expected at least 5 presets, got {len(presets)}"

    def test_get_preset_by_name_roundtrip(self):
        import importlib.util
        spec = importlib.util.spec_from_file_location(
            "run_allegro_benchmarks", BENCHMARK_RUNNER
        )
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
        for expected in ("tiny", "small", "medium"):
            p = mod.get_preset_by_name(expected)
            assert p is not None, f"get_preset_by_name('{expected}') returned None"
            assert p["name"] == expected
# ─── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Allow running as `python tests/test_allegro_benchmarks.py` for a quick smoke run.
    pytest.main([__file__, "-v"])