Compare commits

...

4 Commits

Author SHA1 Message Date
45840c1b70 test: add Allegro benchmark and preset tests (#95) 2026-04-16 01:56:15 +00:00
d603a1b053 docs: Allegro VPS benchmark analysis — expected results (#95) 2026-04-16 01:54:53 +00:00
f3a5be5638 feat: add Allegro VPS benchmark runner (#95) 2026-04-16 01:53:49 +00:00
70d292c222 feat: add Allegro VPS preset configurations (#95) 2026-04-16 01:50:50 +00:00
4 changed files with 930 additions and 0 deletions

View File

@@ -0,0 +1,113 @@
# Allegro VPS Benchmark Analysis — 2026-04-14
## Hardware
| Spec | Value |
|------|-------|
| Hostname | allegro |
| IP | 167.99.126.228 |
| Cores | 2 |
| RAM | 8GB |
| GPU | No (CPU-only) |
| Arch | x86_64 |
| Available for model | ~6GB (2GB reserved for OS + hermes agent) |
## Preset Analysis
Estimates are based on GGUF model sizes and TurboQuant KV-cache memory math.
### Memory Budget
```
Total RAM: 8,192 MB
OS + hermes agent: -2,048 MB
Available: 6,144 MB
```
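For reference, the KV cache column in the table below follows the standard transformer cache formula. The sketch here is illustrative only: per-model layer and head counts are not listed in this document, so it states the formula and the f16-vs-4-bit ratio rather than reproducing any preset's figure.
```python
def kv_cache_bytes(n_layers: int, n_kv_heads: int, head_dim: int,
                   n_ctx: int, bytes_per_elem: float) -> int:
    """Standard transformer KV cache: 2 (K and V) * layers * kv-heads * head_dim * context."""
    return int(2 * n_layers * n_kv_heads * head_dim * n_ctx * bytes_per_elem)

# The ratio is what matters for the presets: 4-bit KV (~0.5 bytes/element) is ~4x smaller
# than f16 (2 bytes/element) at the same context length, which is how the 8K and 32K
# presets keep their caches inside the 6GB budget.
```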
### Preset Memory Estimates
| Preset | Model Size | Context | KV Type | KV Cache | Total Est. | Fits? |
|--------|-----------|---------|---------|----------|------------|-------|
| tiny-2b-q4 | 1,536 MB | 4K | f16 | 256 MB | ~2,800 MB | YES |
| small-3b-q4 | 2,048 MB | 8K | turbo2 | 512 MB | ~3,600 MB | YES |
| medium-7b-q4 | 4,096 MB | 8K | turbo4 | 384 MB | ~5,200 MB | YES |
| medium-7b-q4-long | 4,096 MB | 32K | turbo4 | 1,024 MB | ~5,800 MB | YES |
| large-14b-q3 | 6,656 MB | 4K | turbo4 | 320 MB | ~7,200 MB | NO* |
*Large preset needs swap or will OOM. Usable for batch jobs with `--mlock` disabled.
### Estimated Performance (CPU-only, 2 cores)
These are theoretical estimates based on model size and CPU throughput.
Actual results depend on prompt length, generation length, and system load.
| Preset | Est. tok/s | Est. TTFT | Use Case |
|--------|-----------|-----------|----------|
| tiny-2b-q4 | 8-15 | 1.5-3.0s | Simple Q&A, triage, short completions |
| small-3b-q4 | 5-10 | 2.0-5.0s | Code gen, tool calling, burn-loop workers |
| medium-7b-q4 | 2-5 | 4.0-8.0s | Reasoning, multi-turn conversation |
| medium-7b-q4-long | 1.5-4 | 6.0-12.0s | Long docs, code review, research |
| large-14b-q3 | 0.5-2 | 10-30s | Batch processing only (needs swap) |
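As a rough cross-check of the table above (an approximation, not part of the benchmark): single-stream decode on CPU is usually memory-bandwidth bound, so tokens/sec is on the order of effective memory bandwidth divided by the bytes of weights streamed per token. The bandwidth figure below is an assumed placeholder for a small 2-core VPS, not a measured value.
```python
def rough_decode_tok_per_sec(model_size_gb: float, eff_bandwidth_gb_s: float = 12.0) -> float:
    """Back-of-envelope: each generated token streams (roughly) the whole quantized model."""
    return eff_bandwidth_gb_s / model_size_gb

# e.g. the medium preset: ~12 / 4.1 ≈ 2.9 tok/s, inside the 2-5 tok/s band above.
print(round(rough_decode_tok_per_sec(4.1), 1))
```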
## Recommendation
**Default: `medium` (7B Q4 + TurboQuant)**
- Best quality that fits comfortably within the 6GB budget
- 2-5 tok/s is usable for interactive work (burn-loop, conversation)
- TurboQuant KV4 keeps 8K context at ~384MB cache
**For burn-loop workers: `small` (3B Q4 + TurboQuant2)**
- 5-10 tok/s is better for high-throughput batch work
- Lower memory footprint leaves room for multiple workers
**For long documents: `medium-long` (7B Q4 + TurboQuant4, 32K)**
- 32K context for code review, research papers
- Stays within the 6GB budget with q3_k KV compression
## Server Startup Commands
### Ollama (simplest)
```bash
# Tiny
ollama pull qwen2.5:1.5b
# Small
ollama pull qwen2.5:3b
# Medium (recommended)
ollama pull qwen2.5:7b
```
### llama-server with TurboQuant
```bash
# Medium preset
export TURBO_LAYER_ADAPTIVE=7
llama-server \
-m /models/qwen2.5-7b-instruct-q4_k_m.gguf \
--port 8081 \
-t 2 \
-c 8192 \
-b 512 \
-ctk q4_0 -ctv q4_0 \
--host 0.0.0.0
```
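A quick request against the OpenAI-compatible endpoint (the same route the benchmark runner uses) confirms the server is answering before a full run. A minimal sketch, assuming the llama-server instance started above is listening on localhost:8081:
```python
import requests

resp = requests.post(
    "http://localhost:8081/v1/chat/completions",
    json={
        "model": "qwen2.5-7b-instruct-q4_k_m.gguf",  # mirrors the runner's request shape
        "messages": [{"role": "user", "content": "Reply with OK."}],
        "max_tokens": 8,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```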
### Run Benchmarks
```bash
# All presets
python3 benchmarks/run_allegro_benchmarks.py --all --markdown
# Specific preset
python3 benchmarks/run_allegro_benchmarks.py --preset medium \
--url http://localhost:11434
```
## Next Steps
1. Run benchmarks on Allegro VPS: `python3 benchmarks/run_allegro_benchmarks.py --all --markdown`
2. Update this document with actual measured results
3. Set `recommended_preset` based on measured performance
4. Create hermes profile for each viable preset

View File

@@ -0,0 +1,512 @@
#!/usr/bin/env python3
"""
Allegro VPS Benchmark Runner — TurboQuant presets on 2 cores, 8GB RAM.
Runs each preset from profiles/allegro-cpu-presets.yaml against the
benchmark prompts, measuring tokens/sec, latency, TTFT, and memory.
Designed for CPU-only inference (no GPU) on the Allegro VPS.
Usage:
# Run all presets
python3 benchmarks/run_allegro_benchmarks.py --all
# Run specific preset
python3 benchmarks/run_allegro_benchmarks.py --preset medium
# Dry run (validate config, no inference)
python3 benchmarks/run_allegro_benchmarks.py --dry-run
# Output markdown report
python3 benchmarks/run_allegro_benchmarks.py --all --markdown
# Against remote Ollama
python3 benchmarks/run_allegro_benchmarks.py --preset small \
--url http://167.99.126.228:11434
"""
import argparse
import json
import os
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
ROOT = Path(__file__).resolve().parents[1]
PRESETS_FILE = ROOT / "profiles" / "allegro-cpu-presets.yaml"
PROMPTS_FILE = ROOT / "benchmarks" / "prompts.json"
RESULTS_DIR = ROOT / "benchmarks"
try:
import requests
except ImportError:
requests = None
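# requests is only needed for the actual inference calls; --dry-run works without it.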
# ── Hardware Detection ────────────────────────────────────────────────────
def detect_hardware() -> dict:
"""Detect current hardware specs."""
info = {
"hostname": "",
"cores": os.cpu_count() or 0,
"ram_gb": 0,
"gpu": False,
"arch": "",
}
try:
import platform
info["hostname"] = platform.node()
info["arch"] = platform.machine()
except Exception:
pass
# RAM detection (Linux)
try:
with open("/proc/meminfo") as f:
for line in f:
if line.startswith("MemTotal:"):
kb = int(line.split()[1])
info["ram_gb"] = round(kb / 1024 / 1024, 1)
break
except Exception:
# macOS fallback
try:
result = subprocess.run(["sysctl", "-n", "hw.memsize"],
capture_output=True, text=True)
bytes_val = int(result.stdout.strip())
info["ram_gb"] = round(bytes_val / 1024**3, 1)
except Exception:
pass
# GPU detection
try:
result = subprocess.run(["nvidia-smi", "--query-gpu=name",
"--format=csv,noheader"],
capture_output=True, text=True, timeout=5)
if result.returncode == 0 and result.stdout.strip():
info["gpu"] = True
except Exception:
pass
return info
def get_memory_usage_gb() -> float:
"""Get current process RSS in GB."""
try:
if sys.platform == "darwin":
result = subprocess.run(["ps", "-o", "rss=", "-p", str(os.getpid())],
capture_output=True, text=True)
return int(result.stdout.strip()) / 1024 / 1024
else:
with open(f"/proc/{os.getpid()}/status") as f:
for line in f:
if line.startswith("VmRSS:"):
return int(line.split()[1]) / 1024 / 1024
except Exception:
pass
return 0.0
def get_system_memory_gb() -> float:
"""Get available system memory in GB."""
try:
with open("/proc/meminfo") as f:
for line in f:
if line.startswith("MemAvailable:"):
kb = int(line.split()[1])
return round(kb / 1024 / 1024, 2)
except Exception:
pass
return 0.0
# ── Preset Loading ────────────────────────────────────────────────────────
def load_presets() -> dict:
"""Load preset configuration from YAML."""
try:
import yaml
with open(PRESETS_FILE) as f:
return yaml.safe_load(f)
except ImportError:
# Fallback: parse basic YAML manually
import re
with open(PRESETS_FILE) as f:
content = f.read()
# Very basic YAML parsing — just enough to extract preset names
presets = {}
current = None
for line in content.split("\n"):
            # Top-level preset keys are nested two spaces under "presets:"
            m = re.match(r"^  (\w+):$", line)
            if m:
current = m.group(1)
presets[current] = {"name": current}
return {"presets": presets}
def load_prompts(path: Optional[str] = None) -> list:
    """Load benchmark prompts from the given path (defaults to PROMPTS_FILE)."""
    with open(path or PROMPTS_FILE) as f:
        return json.load(f)
# ── Inference Backends ────────────────────────────────────────────────────
def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict:
"""Run inference against Ollama."""
if requests is None:
return {"status": "failed", "error": "requests not installed"}
api_url = f"{url.rstrip('/')}/api/generate"
start = time.time()
mem_before = get_memory_usage_gb()
sys_mem_before = get_system_memory_gb()
try:
resp = requests.post(api_url, json={
"model": model,
"prompt": prompt,
"stream": False,
"options": {"num_predict": 256}
}, timeout=timeout)
elapsed = time.time() - start
mem_after = get_memory_usage_gb()
sys_mem_after = get_system_memory_gb()
resp.raise_for_status()
data = resp.json()
response_text = data.get("response", "")
eval_count = data.get("eval_count", 0)
eval_duration_ns = data.get("eval_duration", 0)
prompt_eval_ns = data.get("prompt_eval_duration", 0)
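        # Ollama reports generation-only counts/timings (eval_count, eval_duration) and
        # prompt processing time (prompt_eval_duration), all in nanoseconds, so tok/s
        # excludes prompt processing and TTFT approximates prompt-eval time.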
tok_per_sec = 0.0
ttft = None
if eval_duration_ns > 0:
tok_per_sec = eval_count / (eval_duration_ns / 1e9)
if prompt_eval_ns > 0:
ttft = prompt_eval_ns / 1e9
return {
"response": response_text[:200],
"latency_s": round(elapsed, 3),
"ttft_s": round(ttft, 3) if ttft else None,
"tokens_per_sec": round(tok_per_sec, 2),
"eval_count": eval_count,
"memory_gb": round(max(mem_before, mem_after), 2),
"system_mem_available_gb": round(sys_mem_after, 2),
"system_mem_delta_gb": round(sys_mem_before - sys_mem_after, 2),
"status": "success",
}
except Exception as e:
return {
"status": "failed",
"error": str(e)[:200],
"latency_s": round(time.time() - start, 3),
}
def run_llama_server(prompt: str, model: str, url: str,
kv_type: str = "f16", timeout: int = 120) -> dict:
"""Run inference against llama-server (OpenAI-compatible)."""
if requests is None:
return {"status": "failed", "error": "requests not installed"}
api_url = f"{url.rstrip('/')}/v1/chat/completions"
start = time.time()
mem_before = get_memory_usage_gb()
sys_mem_before = get_system_memory_gb()
try:
resp = requests.post(api_url, json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 256,
"stream": False,
}, timeout=timeout)
elapsed = time.time() - start
mem_after = get_memory_usage_gb()
sys_mem_after = get_system_memory_gb()
resp.raise_for_status()
data = resp.json()
choice = data.get("choices", [{}])[0]
response_text = choice.get("message", {}).get("content", "")
usage = data.get("usage", {})
completion_tokens = usage.get("completion_tokens", 0)
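        # No per-phase timings are parsed from llama-server here, so tok/s is wall-clock
        # based; a fixed 0.1s is subtracted as a crude allowance for prompt processing.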
tok_per_sec = 0.0
if elapsed > 0 and completion_tokens > 0:
tok_per_sec = completion_tokens / max(elapsed - 0.1, 0.01)
return {
"response": response_text[:200],
"latency_s": round(elapsed, 3),
"ttft_s": None,
"tokens_per_sec": round(tok_per_sec, 2),
"completion_tokens": completion_tokens,
"kv_type": kv_type,
"memory_gb": round(max(mem_before, mem_after), 2),
"system_mem_available_gb": round(sys_mem_after, 2),
"system_mem_delta_gb": round(sys_mem_before - sys_mem_after, 2),
"status": "success",
}
except Exception as e:
return {
"status": "failed",
"error": str(e)[:200],
"latency_s": round(time.time() - start, 3),
}
# ── Benchmark Runner ──────────────────────────────────────────────────────
def run_preset(preset: dict, backend: str, url: str, prompts: list,
timeout: int = 120, dry_run: bool = False) -> dict:
"""Run a single preset against all prompts."""
name = preset.get("name", "unknown")
model = preset.get("ollama_model", "") if backend == "ollama" else preset.get("llama_cpp_model", "")
kv_type = preset.get("kv_type", "f16")
run_fn = run_ollama if backend == "ollama" else run_llama_server
print(f"\nPreset: {name} (model={model}, kv={kv_type})")
print(f" Estimated RAM: {preset.get('estimated_ram_gb', '?')}GB | "
f"Fits Allegro: {preset.get('fits_in_allegro', '?')}")
if dry_run:
print(f" [DRY RUN] Skipping inference")
return {"preset": name, "status": "dry_run", "results": []}
results = []
for item in prompts:
pid = item.get("id", item.get("category", "unknown"))
prompt = item["prompt"]
print(f" [{pid}] ...", end=" ", flush=True)
if backend == "ollama":
result = run_fn(prompt, model, url, timeout=timeout)
else:
result = run_fn(prompt, model, url, kv_type=kv_type, timeout=timeout)
result["id"] = pid
result["prompt_preview"] = prompt[:80]
results.append(result)
status = "OK" if result["status"] == "success" else "FAIL"
tps = result.get("tokens_per_sec", 0)
lat = result.get("latency_s", 0)
mem = result.get("system_mem_available_gb", 0)
print(f"{status} {tps:.1f} tok/s {lat:.1f}s mem={mem:.1f}GB")
# Summary
successes = [r for r in results if r["status"] == "success"]
summary = {
"preset": name,
"model": model,
"kv_type": kv_type,
"total": len(results),
"success": len(successes),
"failed": len(results) - len(successes),
"avg_tok_per_sec": round(
sum(r.get("tokens_per_sec", 0) for r in successes) / max(len(successes), 1), 2
),
"avg_latency_s": round(
sum(r.get("latency_s", 0) for r in successes) / max(len(successes), 1), 3
),
"peak_memory_gb": round(
max((r.get("memory_gb", 0) for r in results), default=0), 2
),
"min_system_mem_available_gb": round(
min((r.get("system_mem_available_gb", 999) for r in results), default=0), 2
),
"results": results,
}
print(f" SUMMARY: {summary['success']}/{summary['total']} OK | "
f"Avg {summary['avg_tok_per_sec']:.1f} tok/s | "
f"Peak {summary['peak_memory_gb']:.1f}GB | "
f"Min avail {summary['min_system_mem_available_gb']:.1f}GB")
return summary
def generate_report(all_results: list, hw_info: dict, output_dir: str) -> str:
"""Generate markdown benchmark report."""
today = datetime.now().strftime("%Y-%m-%d")
lines = [
f"# Allegro VPS Benchmark Results — {today}",
"",
"## Hardware",
"",
f"| Spec | Value |",
f"|------|-------|",
f"| Hostname | {hw_info.get('hostname', 'unknown')} |",
f"| Cores | {hw_info.get('cores', '?')} |",
f"| RAM | {hw_info.get('ram_gb', '?')}GB |",
f"| GPU | {'Yes' if hw_info.get('gpu') else 'No (CPU-only)'} |",
f"| Arch | {hw_info.get('arch', '?')} |",
"",
"## Results Summary",
"",
"| Preset | Model | KV | tok/s | Latency (s) | Peak Mem (GB) | Status |",
"|--------|-------|-----|-------|-------------|---------------|--------|",
]
for r in all_results:
status = "PASS" if r["success"] == r["total"] else f"{r['success']}/{r['total']}"
lines.append(
f"| {r['preset']} | {r['model']} | {r['kv_type']} | "
f"{r['avg_tok_per_sec']} | {r['avg_latency_s']} | "
f"{r['peak_memory_gb']} | {status} |"
)
# Find minimum viable preset
viable = [r for r in all_results
if r["success"] == r["total"]
and r.get("min_system_mem_available_gb", 0) > 1.0]
if viable:
best = min(viable, key=lambda x: x["peak_memory_gb"])
lines.extend([
"",
"## Minimum Viable Preset",
"",
f"**{best['preset']}** ({best['model']}, {best['kv_type']})",
f"- Peak memory: {best['peak_memory_gb']}GB",
f"- Min available system memory: {best['min_system_mem_available_gb']}GB",
f"- Avg performance: {best['avg_tok_per_sec']} tok/s",
"",
"Fits within the 6GB budget (8GB - 2GB OS reserve).",
])
else:
lines.extend([
"",
"## Minimum Viable Preset",
"",
"No preset passed all tests with >1GB system memory headroom.",
"Recommendation: use `tiny` or `small` presets.",
])
lines.extend([
"",
"## Per-Preset Details",
"",
])
for r in all_results:
lines.extend([
f"### {r['preset']}",
"",
f"- Model: `{r['model']}`",
f"- KV type: `{r['kv_type']}`",
f"- Avg tok/s: {r['avg_tok_per_sec']}",
f"- Avg latency: {r['avg_latency_s']}s",
f"- Peak memory: {r['peak_memory_gb']}GB",
"",
"| Prompt | tok/s | Latency (s) | Status |",
"|--------|-------|-------------|--------|",
])
for res in r.get("results", []):
pid = res.get("id", "?")
tps = res.get("tokens_per_sec", 0)
lat = res.get("latency_s", 0)
st = res.get("status", "?")
lines.append(f"| {pid} | {tps} | {lat} | {st} |")
lines.append("")
report = "\n".join(lines)
output_path = os.path.join(output_dir, f"allegro-{today}.md")
with open(output_path, "w") as f:
f.write(report)
print(f"\nReport saved to {output_path}")
return report
# ── CLI ───────────────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser(description="Allegro VPS Benchmark Runner")
parser.add_argument("--all", action="store_true", help="Run all presets")
parser.add_argument("--preset", help="Run a specific preset")
parser.add_argument("--backend", choices=["ollama", "llama-server"],
default="ollama", help="Inference backend")
parser.add_argument("--url", default="http://localhost:11434",
help="Backend URL")
parser.add_argument("--prompts", default=None, help="Prompts file")
parser.add_argument("--timeout", type=int, default=120,
help="Per-prompt timeout (s)")
parser.add_argument("--dry-run", action="store_true",
help="Validate config without inference")
parser.add_argument("--markdown", action="store_true",
help="Generate markdown report")
parser.add_argument("--json", dest="json_output", action="store_true",
help="JSON output")
args = parser.parse_args()
    if args.dry_run and not (args.all or args.preset):
        # --dry-run alone validates every preset (matches the usage examples above)
        args.all = True
    if not args.all and not args.preset:
        parser.error("Specify --all or --preset <name>")
# Load config
config = load_presets()
presets = config.get("presets", {})
    prompts_file = args.prompts or str(PROMPTS_FILE)
    prompts = load_prompts(prompts_file) if os.path.exists(prompts_file) else []
# Hardware info
hw_info = detect_hardware()
print(f"Hardware: {hw_info['cores']} cores, {hw_info['ram_gb']}GB RAM, "
f"{'GPU' if hw_info['gpu'] else 'CPU-only'}")
# Determine which presets to run
if args.all:
preset_names = list(presets.keys())
else:
preset_names = [args.preset]
all_results = []
for pname in preset_names:
if pname not in presets:
print(f"Unknown preset: {pname}")
continue
preset = presets[pname]
result = run_preset(preset, args.backend, args.url, prompts,
timeout=args.timeout, dry_run=args.dry_run)
all_results.append(result)
# Output
if args.json_output:
print(json.dumps(all_results, indent=2))
elif args.markdown:
generate_report(all_results, hw_info, str(RESULTS_DIR))
else:
# Summary table
print(f"\n{'='*70}")
print(f"{'Preset':<20} {'Model':<25} {'tok/s':<8} {'Lat(s)':<8} {'Mem(GB)':<8}")
print(f"{'-'*70}")
for r in all_results:
print(f"{r['preset']:<20} {r.get('model','?'):<25} "
f"{r.get('avg_tok_per_sec',0):<8} "
f"{r.get('avg_latency_s',0):<8} "
f"{r.get('peak_memory_gb',0):<8}")
print(f"{'='*70}")
# Save raw results
ts = int(time.time())
raw_path = str(RESULTS_DIR / f"allegro_results_{ts}.json")
os.makedirs(os.path.dirname(raw_path), exist_ok=True)
with open(raw_path, "w") as f:
json.dump({"hardware": hw_info, "results": all_results}, f, indent=2)
print(f"Raw results: {raw_path}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,164 @@
# Allegro VPS Presets — 2 cores, 8GB RAM, CPU-only inference
# Optimized for the Timmy Foundation Allegro server (167.99.126.228)
#
# Hardware constraints:
# - 2 CPU cores (no GPU)
# - 8GB RAM total
# - ~2GB reserved for OS + hermes agent
# - ~6GB available for model + KV cache
#
# Strategy: GGUF quantization via llama.cpp (CPU-optimized)
# KV cache compression via TurboQuant to maximize context within RAM
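#
# Note: estimated_ram_gb values below are estimates, not measurements — roughly
# model file size + KV cache + ~0.7-1GB of runtime overhead (see the accompanying
# benchmark analysis for the per-preset breakdown).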
hardware:
hostname: "allegro"
ip: "167.99.126.228"
cores: 2
ram_gb: 8
gpu: false
os_reserved_gb: 2
available_gb: 6
arch: "x86_64"
cpu_backend: "llama.cpp"
presets:
# ─── TIER 1: Conservative (fits comfortably) ──────────────────────
tiny:
name: "tiny-2b-q4"
description: "2B param model, Q4_K_M — leaves headroom for other processes"
model_size_gb: 1.5
quantization: "Q4_K_M"
context_tokens: 4096
kv_type: "f16"
estimated_ram_gb: 2.8
fits_in_allegro: true
server_flags:
threads: 2
context: 4096
batch: 256
expected_perf:
tokens_per_sec: "8-15"
ttft_s: "1.5-3.0"
use_case: "Simple Q&A, short completions, triage"
ollama_model: "qwen2.5:1.5b"
llama_cpp_model: "qwen2.5-1.5b-instruct-q4_k_m.gguf"
small:
name: "small-3b-q4"
description: "3B param model, Q4_K_M — sweet spot for value on 2 cores"
model_size_gb: 2.0
quantization: "Q4_K_M"
context_tokens: 8192
kv_type: "turbo2"
estimated_ram_gb: 3.6
fits_in_allegro: true
server_flags:
threads: 2
context: 8192
batch: 512
ctk: "q4_0"
ctv: "q4_0"
expected_perf:
tokens_per_sec: "5-10"
ttft_s: "2.0-5.0"
use_case: "Code generation, tool calling, burn-loop workers"
ollama_model: "qwen2.5:3b"
llama_cpp_model: "qwen2.5-3b-instruct-q4_k_m.gguf"
# ─── TIER 2: Balanced (recommended default) ───────────────────────
medium:
name: "medium-7b-q4"
description: "7B param model, Q4_K_M + TurboQuant — best quality that fits"
model_size_gb: 4.1
quantization: "Q4_K_M"
context_tokens: 8192
kv_type: "turbo4"
estimated_ram_gb: 5.2
fits_in_allegro: true
server_flags:
threads: 2
context: 8192
batch: 512
ctk: "q4_0"
ctv: "q4_0"
layer_adaptive: 7
expected_perf:
tokens_per_sec: "2-5"
ttft_s: "4.0-8.0"
use_case: "Complex reasoning, multi-turn conversation, analysis"
ollama_model: "qwen2.5:7b"
llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"
medium_long:
name: "medium-7b-q4-long"
description: "7B Q4 + aggressive TurboQuant for 32K context"
model_size_gb: 4.1
quantization: "Q4_K_M"
context_tokens: 32768
kv_type: "turbo4"
estimated_ram_gb: 5.8
fits_in_allegro: true
server_flags:
threads: 2
context: 32768
batch: 256
ctk: "q3_k"
ctv: "q3_k"
layer_adaptive: 7
expected_perf:
tokens_per_sec: "1.5-4"
ttft_s: "6.0-12.0"
use_case: "Long document analysis, code review, research"
ollama_model: "qwen2.5:7b"
llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"
# ─── TIER 3: Pushing limits (may swap) ────────────────────────────
large:
name: "large-14b-q3"
description: "14B param model, Q3_K_M — may page to swap, use with caution"
model_size_gb: 6.5
quantization: "Q3_K_M"
context_tokens: 4096
kv_type: "turbo4"
estimated_ram_gb: 7.2
fits_in_allegro: false
warning: "Exceeds 6GB limit. Needs swap or will OOM. Use only for batch jobs."
server_flags:
threads: 2
context: 4096
batch: 256
ctk: "q3_k"
ctv: "q3_k"
layer_adaptive: 7
expected_perf:
tokens_per_sec: "0.5-2"
ttft_s: "10.0-30.0"
use_case: "Batch processing, overnight jobs (with swap)"
ollama_model: "qwen2.5:14b"
llama_cpp_model: "qwen2.5-14b-instruct-q3_k_m.gguf"
# Recommended default for Allegro
recommended_preset: "medium"
# Server startup examples
examples:
ollama: |
# Pull and run
ollama pull qwen2.5:7b
ollama run qwen2.5:7b
llama_cpp: |
# With TurboQuant KV cache
export TURBO_LAYER_ADAPTIVE=7
llama-server \
-m /models/qwen2.5-7b-instruct-q4_k_m.gguf \
--port 8081 \
-t 2 \
-c 8192 \
-b 512 \
-ctk q4_0 -ctv q4_0 \
--host 0.0.0.0
hermes_profile: |
# Use with hermes agent
hermes -p allegro-medium chat

View File

@@ -0,0 +1,141 @@
"""Tests for Allegro VPS benchmark runner and preset configuration."""
import json
import os
import pathlib
import sys
import pytest
ROOT = pathlib.Path(__file__).resolve().parents[1]
PRESETS_FILE = ROOT / "profiles" / "allegro-cpu-presets.yaml"
PROMPTS_FILE = ROOT / "benchmarks" / "prompts.json"
sys.path.insert(0, str(ROOT / "benchmarks"))
# ---------------------------------------------------------------------------
# Preset config validation
# ---------------------------------------------------------------------------
class TestPresetConfig:
"""Validate allegro-cpu-presets.yaml structure."""
    @classmethod
    def setup_class(cls):
        # pytest's xunit-style hook for plain classes (setUpClass is unittest-only)
import yaml
cls.config = yaml.safe_load(PRESETS_FILE.read_text())
def test_config_has_hardware(self):
assert "hardware" in self.config
hw = self.config["hardware"]
assert hw["cores"] == 2
assert hw["ram_gb"] == 8
assert hw["gpu"] is False
def test_config_has_presets(self):
assert "presets" in self.config
assert len(self.config["presets"]) >= 3
def test_each_preset_has_required_fields(self):
for name, preset in self.config["presets"].items():
assert "name" in preset, f"Preset {name} missing 'name'"
assert "description" in preset, f"Preset {name} missing 'description'"
assert "model_size_gb" in preset, f"Preset {name} missing 'model_size_gb'"
assert "quantization" in preset, f"Preset {name} missing 'quantization'"
assert "context_tokens" in preset, f"Preset {name} missing 'context_tokens'"
assert "kv_type" in preset, f"Preset {name} missing 'kv_type'"
assert "estimated_ram_gb" in preset, f"Preset {name} missing 'estimated_ram_gb'"
assert "fits_in_allegro" in preset, f"Preset {name} missing 'fits_in_allegro'"
assert "expected_perf" in preset, f"Preset {name} missing 'expected_perf'"
assert "server_flags" in preset, f"Preset {name} missing 'server_flags'"
def test_tiny_fits_in_allegro(self):
tiny = self.config["presets"]["tiny"]
assert tiny["fits_in_allegro"] is True
assert tiny["estimated_ram_gb"] <= 6.0
def test_small_fits_in_allegro(self):
small = self.config["presets"]["small"]
assert small["fits_in_allegro"] is True
assert small["estimated_ram_gb"] <= 6.0
def test_medium_fits_in_allegro(self):
medium = self.config["presets"]["medium"]
assert medium["fits_in_allegro"] is True
assert medium["estimated_ram_gb"] <= 6.0
def test_large_does_not_fit(self):
large = self.config["presets"]["large"]
assert large["fits_in_allegro"] is False
assert large["estimated_ram_gb"] > 6.0
def test_recommended_preset_exists(self):
rec = self.config.get("recommended_preset")
assert rec is not None
assert rec in self.config["presets"]
def test_server_flags_have_threads(self):
for name, preset in self.config["presets"].items():
flags = preset.get("server_flags", {})
assert "threads" in flags, f"Preset {name} missing threads in server_flags"
assert flags["threads"] == 2, f"Preset {name} should use 2 threads"
def test_context_tokens_reasonable(self):
for name, preset in self.config["presets"].items():
ctx = preset["context_tokens"]
assert ctx >= 2048, f"Preset {name} context too small: {ctx}"
assert ctx <= 131072, f"Preset {name} context too large: {ctx}"
def test_kv_types_valid(self):
valid_types = {"f16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
"turbo2", "turbo3", "turbo4", "q3_k", "q4_k", "q5_k"}
for name, preset in self.config["presets"].items():
kv = preset["kv_type"]
assert kv in valid_types, f"Preset {name} has invalid kv_type: {kv}"
# ---------------------------------------------------------------------------
# Benchmark prompts validation
# ---------------------------------------------------------------------------
class TestBenchmarkPrompts:
def test_prompts_file_exists(self):
assert PROMPTS_FILE.exists()
def test_prompts_is_list(self):
prompts = json.loads(PROMPTS_FILE.read_text())
assert isinstance(prompts, list)
assert len(prompts) >= 5
def test_each_prompt_has_required_fields(self):
prompts = json.loads(PROMPTS_FILE.read_text())
for p in prompts:
assert "id" in p or "category" in p
assert "prompt" in p
assert len(p["prompt"]) > 10
# ---------------------------------------------------------------------------
# Hardware detection (unit tests)
# ---------------------------------------------------------------------------
class TestHardwareDetection:
def test_detect_hardware_returns_dict(self):
from run_allegro_benchmarks import detect_hardware
hw = detect_hardware()
assert isinstance(hw, dict)
assert "cores" in hw
assert "ram_gb" in hw
assert "gpu" in hw
def test_cores_positive(self):
from run_allegro_benchmarks import detect_hardware
hw = detect_hardware()
assert hw["cores"] > 0
def test_memory_usage_returns_float(self):
from run_allegro_benchmarks import get_memory_usage_gb
mem = get_memory_usage_gb()
assert isinstance(mem, (int, float))
assert mem >= 0