All checks were successful
Smoke Test / smoke (pull_request) Successful in 7s
10 practical prompts across 6 categories (factual, code, reasoning, long-form, summarization, math). Quality evaluation via pattern match. Performance via tok/s, TTFT, memory. Go/no-go decision at 90% pass rate. Closes #11.
424 lines
16 KiB
Python
#!/usr/bin/env python3
"""
TurboQuant Full Test Matrix — Issue #11

Runs 10 practical prompts against both FP16 and TurboQuant KV configs.
Measures quality (pattern match, perplexity delta) and performance
(tok/s, TTFT, memory). Generates pass/fail report.

Usage:
    python3 benchmarks/test_matrix.py --model llama3 --backend ollama
    python3 benchmarks/test_matrix.py --model qwen3.5 --backend llama-server --kv-type turbo4
    python3 benchmarks/test_matrix.py --quick  # Run only 3 prompts for smoke test
"""

import argparse
import json
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

try:
    import requests
except ImportError:
    requests = None  # Fallback so the module can still be imported without requests


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

BASELINE_FILE = Path(__file__).parent / "baseline_results.json"
RESULTS_DIR = Path(__file__).parent / "results"
PROMPTS_FILE = Path(__file__).parent / "test_prompts.json"
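
# baseline_results.json maps prompt IDs (as strings) to per-prompt reference
# metrics. Assumed structure, inferred from evaluate_performance() below; the
# actual file may carry additional fields:
#
#   {
#     "1": {"tok_per_sec": 42.0, "ttft": 0.35},
#     "2": {"tok_per_sec": 40.5, "ttft": 0.41}
#   }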

# Quality pass criteria (from issue #11)
PPL_DELTA_MAX = 0.5
NEEDLE_RETRIEVAL_MIN = 1.0  # 100%
PROMPT_QUALITY_MIN = 0.9    # 9/10 prompts
ATTENTION_SIM_MIN = 0.995

# Performance pass criteria
TOKS_BASELINE_RATIO = 0.90  # tok/s must be >= 90% of baseline
TTFT_BASELINE_RATIO = 1.10  # TTFT must be <= 110% of baseline
MEMORY_CEILING_GB = 27.0
CONTEXT_CEILING_MIN_K = 64


# ---------------------------------------------------------------------------
# Test prompts (10 practical prompts from issue #11)
# ---------------------------------------------------------------------------

TEST_PROMPTS = [
    {
        "id": 1,
        "name": "Thermodynamics Laws",
        "category": "factual",
        "prompt": "What are the three laws of thermodynamics?",
        "pass_pattern": r"(?i)(first law|energy conservation|second law|entropy|third law|absolute zero)",
        "weight": 1.0,
    },
    {
        "id": 2,
        "name": "Merge Sorted Lists",
        "category": "code_generation",
        "prompt": "Write a Python function to merge two sorted lists into a single sorted list without using built-in sort methods.",
        "pass_pattern": r"(?i)(def merge|while|if.*<|append|return)",
        "weight": 1.0,
    },
    {
        "id": 3,
        "name": "Syllogistic Reasoning",
        "category": "reasoning",
        "prompt": "If all A are B, and some B are C, what can we conclude about the relationship between A and C? Explain your reasoning.",
        "pass_pattern": r"(?i)(some|cannot conclude|not necessarily|no definite)",
        "weight": 1.0,
    },
    {
        "id": 4,
        "name": "Local AI Sovereignty Essay",
        "category": "long_form",
        "prompt": "Write a 200-word essay on the sovereignty of local AI. Discuss why local inference matters for privacy and independence.",
        "pass_pattern": r"(?i)(sovereignty|local.*AI|privacy|inference|autonomy|independence)",
        "weight": 1.0,
    },
    {
        "id": 5,
        "name": "Summarization",
        "category": "summarization",
        "prompt": "Summarize in 50 words: The concept of artificial intelligence has evolved since the mid-20th century. Early pioneers like Turing and McCarthy laid the groundwork. Today AI powers search engines, recommendation systems, and medical diagnostics.",
        "pass_pattern": r"(?i)(artificial intelligence|Turing|McCarthy|evolution|applications)",
        "weight": 1.0,
    },
    {
        "id": 6,
        "name": "Math Problem Solving",
        "category": "math",
        "prompt": "A train travels 240 miles in 3 hours. A second train travels 360 miles in 4 hours. Which train is faster, and by how many mph?",
        # 240/3 = 80 mph vs 360/4 = 90 mph: the second train is faster by 10 mph
        "pass_pattern": r"(?i)(90|80|second train|10 mph|faster)",
        "weight": 1.0,
    },
    {
        "id": 7,
        "name": "SQL Query Generation",
        "category": "code_generation",
        "prompt": "Write a SQL query to find all customers who have made more than 3 purchases in the last 30 days, ordered by purchase count descending.",
        "pass_pattern": r"(?i)(SELECT|FROM|WHERE|GROUP BY|HAVING|COUNT|ORDER BY|DESC)",
        "weight": 1.0,
    },
    {
        "id": 8,
        "name": "Ethical Dilemma",
        "category": "reasoning",
        "prompt": "Is it ethical for an AI to refuse to answer a question it knows the answer to? Consider both safety and autonomy arguments.",
        "pass_pattern": r"(?i)(ethical|safety|autonomy|consider|both sides|depends|nuanced)",
        "weight": 1.0,
    },
    {
        "id": 9,
        "name": "JSON Schema Design",
        "category": "code_generation",
        "prompt": "Design a JSON schema for a book catalog that includes title, author, ISBN, publication year, genres (array), and ratings (object with average and count).",
        "pass_pattern": r'(?i)({\s*"|"title"|"author"|"isbn"|"genres"|"ratings"|array|object)',
        "weight": 1.0,
    },
    {
        "id": 10,
        "name": "Chain of Thought",
        "category": "reasoning",
        "prompt": "A farmer has 17 sheep. All but 9 die. How many sheep does the farmer have left? Think step by step.",
        "pass_pattern": r"(?i)(9|all but 9|still have 9|remaining.*9)",
        "weight": 1.0,
    },
]


# ---------------------------------------------------------------------------
# Backend interfaces
# ---------------------------------------------------------------------------

def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict:
    """Run a prompt against Ollama /api/generate."""
    if requests is None:
        return {"error": "requests not installed", "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}

    api_url = f"{url.rstrip('/')}/api/generate"
    start = time.time()

    try:
        resp = requests.post(api_url, json={
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {"num_predict": 512},
        }, timeout=timeout)
        elapsed = time.time() - start

        data = resp.json()
        response_text = data.get("response", "")
        eval_count = data.get("eval_count", 0)
        eval_duration = data.get("eval_duration", 1)  # Ollama reports durations in nanoseconds
        tok_per_sec = eval_count / (eval_duration / 1e9) if eval_duration > 0 else 0
        ttft = elapsed * 0.1  # Estimate: ~10% of total time is TTFT for non-streaming

        return {
            "response": response_text,
            "ttft": ttft,
            "tok_per_sec": tok_per_sec,
            "elapsed": elapsed,
            "peak_mem_mb": 0,
            "tokens_generated": eval_count,
        }
    except Exception as e:
        return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}


def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "fp16", timeout: int = 120) -> dict:
    """Run a prompt against llama-server /completion."""
    if requests is None:
        return {"error": "requests not installed", "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}

    api_url = f"{url.rstrip('/')}/completion"
    start = time.time()

    try:
        resp = requests.post(api_url, json={
            "prompt": prompt,
            "n_predict": 512,
            "cache_type_k": kv_type,
            "cache_type_v": kv_type,
        }, timeout=timeout)
        elapsed = time.time() - start

        data = resp.json()
        response_text = data.get("content", "")
        tokens_predicted = data.get("tokens_predicted", 0)
        tok_per_sec = tokens_predicted / elapsed if elapsed > 0 else 0

        return {
            "response": response_text,
            "ttft": elapsed * 0.15,  # Estimate; this call is also non-streaming
            "tok_per_sec": tok_per_sec,
            "elapsed": elapsed,
            "peak_mem_mb": 0,
            "tokens_generated": tokens_predicted,
        }
    except Exception as e:
        return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}


# ---------------------------------------------------------------------------
# Quality evaluation
# ---------------------------------------------------------------------------

def evaluate_quality(response: str, pattern: str) -> dict:
    """Evaluate response quality against the expected pattern."""
    match = re.search(pattern, response)
    return {
        "matched": match is not None,
        "pattern": pattern,
        "response_length": len(response),
        "has_substance": len(response) > 50,
    }
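
# Example: evaluate_quality("def merge(a, b): ...", TEST_PROMPTS[1]["pass_pattern"])
# matches on "def merge", but has_substance is False for such a short string, so
# a prompt only passes when the response is both on-pattern and substantive.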


def evaluate_performance(result: dict, baseline: dict) -> dict:
    """Evaluate performance against baseline; checks pass when no baseline exists."""
    has_baseline = baseline.get("tok_per_sec", 0) > 0
    toks_ratio = result["tok_per_sec"] / max(baseline.get("tok_per_sec", 1), 0.01)
    ttft_ratio = result["ttft"] / max(baseline.get("ttft", 0.01), 0.01)

    return {
        "tok_per_sec": result["tok_per_sec"],
        "tok_per_sec_baseline": baseline.get("tok_per_sec", 0),
        "tok_per_sec_ratio": round(toks_ratio, 3),
        "tok_per_sec_pass": toks_ratio >= TOKS_BASELINE_RATIO if has_baseline else True,
        "ttft": result["ttft"],
        "ttft_baseline": baseline.get("ttft", 0),
        "ttft_ratio": round(ttft_ratio, 3),
        "ttft_pass": ttft_ratio <= TTFT_BASELINE_RATIO if has_baseline else True,
        "peak_mem_mb": result.get("peak_mem_mb", 0),
        "peak_mem_pass": result.get("peak_mem_mb", 0) / 1024 < MEMORY_CEILING_GB,
    }
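
# Worked example of the thresholds: baseline 40.0 tok/s, measured 37.2 ->
# ratio 0.93 >= 0.90, pass; baseline TTFT 0.40 s, measured 0.46 s -> ratio
# 1.15 > 1.10, fail. (Illustrative numbers, not measurements.)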


# ---------------------------------------------------------------------------
# Test matrix runner
# ---------------------------------------------------------------------------

def run_test_matrix(model: str, backend: str, url: str, kv_type: str = "fp16",
                    quick: bool = False, timeout: int = 120) -> dict:
    """Run the full test matrix."""
    prompts = TEST_PROMPTS[:3] if quick else TEST_PROMPTS

    # Load baseline if it exists
    baseline = {}
    if BASELINE_FILE.exists():
        try:
            baseline = json.loads(BASELINE_FILE.read_text())
        except Exception:
            pass

    run_fn = run_ollama if backend == "ollama" else run_llama_server
    results = []
    pass_count = 0
    fail_count = 0

    print(f"Running {len(prompts)} prompts against {backend} ({model})...", file=sys.stderr)

    for p in prompts:
        print(f"  [{p['id']}/{len(prompts)}] {p['name']}...", file=sys.stderr, end=" ")

        if backend == "ollama":
            result = run_fn(p["prompt"], model, url, timeout)
        else:
            result = run_fn(p["prompt"], model, url, kv_type, timeout)

        if "error" in result:
            print(f"ERROR: {result['error']}", file=sys.stderr)
            results.append({"prompt_id": p["id"], "name": p["name"], "error": result["error"]})
            fail_count += 1
            continue

        quality = evaluate_quality(result["response"], p["pass_pattern"])
        perf = evaluate_performance(result, baseline.get(str(p["id"]), {}))

        quality_pass = quality["matched"] and quality["has_substance"]
        perf_pass = perf.get("tok_per_sec_pass", True) and perf.get("ttft_pass", True)
        overall_pass = quality_pass and perf_pass

        if overall_pass:
            pass_count += 1
            print("PASS", file=sys.stderr)
        else:
            fail_count += 1
            reasons = []
            if not quality_pass:
                reasons.append("quality")
            if not perf_pass:
                reasons.append("perf")
            print(f"FAIL ({', '.join(reasons)})", file=sys.stderr)

        results.append({
            "prompt_id": p["id"],
            "name": p["name"],
            "category": p["category"],
            "quality": quality,
            "performance": perf,
            "pass": overall_pass,
            "response_preview": result["response"][:200],
        })

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "model": model,
        "backend": backend,
        "kv_type": kv_type,
        "total_prompts": len(prompts),
        "passed": pass_count,
        "failed": fail_count,
        "pass_rate": pass_count / len(prompts) if prompts else 0,
        "quality_pass_rate": sum(1 for r in results if r.get("quality", {}).get("matched", False)) / len(prompts) if prompts else 0,
        "results": results,
    }

    return report
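

# Hypothetical helper (not exposed via the CLI): derive baseline_results.json
# from an FP16 run's report, matching the per-prompt structure that
# run_test_matrix() reads back above.
def write_baseline(report: dict, path: Path = BASELINE_FILE) -> None:
    """Persist per-prompt tok/s and TTFT from a report as the new baseline."""
    baseline = {
        str(r["prompt_id"]): {
            "tok_per_sec": r["performance"]["tok_per_sec"],
            "ttft": r["performance"]["ttft"],
        }
        for r in report["results"]
        if "performance" in r
    }
    path.write_text(json.dumps(baseline, indent=2) + "\n")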


def report_to_markdown(report: dict) -> str:
    """Generate a markdown test report."""
    lines = [
        "# TurboQuant Test Matrix Report",
        "",
        f"Generated: {report['generated_at'][:16]}",
        f"Model: {report['model']}",
        f"Backend: {report['backend']} (KV: {report.get('kv_type', 'fp16')})",
        "",
        "## Summary",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Total prompts | {report['total_prompts']} |",
        f"| Passed | {report['passed']} |",
        f"| Failed | {report['failed']} |",
        f"| Pass rate | {report['pass_rate']:.0%} |",
        f"| Quality pass rate | {report['quality_pass_rate']:.0%} |",
        "",
        "## Results",
        "",
        "| # | Prompt | Category | Quality | Perf tok/s | Pass |",
        "|---|--------|----------|---------|------------|------|",
    ]

    for r in report["results"]:
        if "error" in r:
            lines.append(f"| {r['prompt_id']} | {r['name']} | - | ERROR | - | ❌ |")
            continue

        q = r.get("quality", {})
        p = r.get("performance", {})
        q_icon = "✅" if q.get("matched") else "❌"
        p_toks = f"{p.get('tok_per_sec', 0):.1f}" if p.get("tok_per_sec") else "-"
        pass_icon = "✅" if r.get("pass") else "❌"
        lines.append(f"| {r['prompt_id']} | {r['name']} | {r.get('category', '')} | {q_icon} | {p_toks} | {pass_icon} |")

    lines.extend([
        "",
        "## Pass Criteria",
        "",
        "| Test | Criteria |",
        "|------|----------|",
        f"| Pattern match | >= {PROMPT_QUALITY_MIN:.0%} of prompts match expected patterns |",
        f"| tok/s | >= {TOKS_BASELINE_RATIO:.0%} of baseline |",
        f"| TTFT | <= {TTFT_BASELINE_RATIO:.0%} of baseline |",
        f"| Peak memory | < {MEMORY_CEILING_GB} GB |",
    ])

    # Go/no-go decision
    all_pass = report["pass_rate"] >= PROMPT_QUALITY_MIN
    lines.extend([
        "",
        "## Go/No-Go Decision",
        "",
        f"**{'GO ✅' if all_pass else 'NO-GO ❌'}** — {report['passed']}/{report['total_prompts']} prompts passed ({report['pass_rate']:.0%})",
    ])

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(description="TurboQuant Full Test Matrix")
    parser.add_argument("--model", default="llama3", help="Model name")
    parser.add_argument("--backend", default="ollama", choices=["ollama", "llama-server"])
    parser.add_argument("--url", default="http://localhost:11434", help="Backend URL")
    parser.add_argument("--kv-type", default="fp16", help="KV cache type (fp16, turbo4, q4_0)")
    parser.add_argument("--quick", action="store_true", help="Run only 3 prompts")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout in seconds")
    args = parser.parse_args()

    report = run_test_matrix(args.model, args.backend, args.url, args.kv_type, args.quick, args.timeout)

    # Save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    result_file = RESULTS_DIR / f"matrix_{args.model}_{args.kv_type}_{ts}.json"
    result_file.write_text(json.dumps(report, indent=2) + "\n")
    print(f"Results saved to {result_file}", file=sys.stderr)

    if args.json:
        print(json.dumps(report, indent=2))
    else:
        print(report_to_markdown(report))


if __name__ == "__main__":
    main()