#!/usr/bin/env python3 """ TurboQuant Full Test Matrix — Issue #11 Runs 10 practical prompts against both FP16 and TurboQuant KV configs. Measures quality (pattern match, perplexity delta) and performance (tok/s, TTFT, memory). Generates pass/fail report. Usage: python3 benchmarks/test_matrix.py --model llama3 --backend ollama python3 benchmarks/test_matrix.py --model qwen3.5 --backend llama-server --kv-type turbo4 python3 benchmarks/test_matrix.py --quick # Run only 3 prompts for smoke test """ import argparse import json import os import re import sys import time from datetime import datetime, timezone from pathlib import Path from typing import Any, Optional try: import requests except ImportError: requests = None # Fallback for testing without requests # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- BASELINE_FILE = Path(__file__).parent / "baseline_results.json" RESULTS_DIR = Path(__file__).parent / "results" PROMPTS_FILE = Path(__file__).parent / "test_prompts.json" # Quality pass criteria (from issue #11) PPL_DELTA_MAX = 0.5 NEEDLE_RETRIEVAL_MIN = 1.0 # 100% PROMPT_QUALITY_MIN = 0.9 # 9/10 ATTENTION_SIM_MIN = 0.995 # Performance pass criteria TOKS_BASELINE_RATIO = 0.90 # >= 90% baseline TTFT_BASELINE_RATIO = 1.10 # <= 110% baseline MEMORY_CEILING_GB = 27.0 CONTEXT_CEILING_MIN_K = 64 # --------------------------------------------------------------------------- # Test prompts (10 practical prompts from issue #11) # --------------------------------------------------------------------------- TEST_PROMPTS = [ { "id": 1, "name": "Thermodynamics Laws", "category": "factual", "prompt": "What are the three laws of thermodynamics?", "pass_pattern": r"(?i)(first law|energy conservation|second law|entropy|third law|absolute zero)", "weight": 1.0, }, { "id": 2, "name": "Merge Sorted Lists", "category": "code_generation", "prompt": "Write a Python 
function to merge two sorted lists into a single sorted list without using built-in sort methods.", "pass_pattern": r"(?i)(def merge|while|if.*<|append|return)", "weight": 1.0, }, { "id": 3, "name": "Syllogistic Reasoning", "category": "reasoning", "prompt": "If all A are B, and some B are C, what can we conclude about the relationship between A and C? Explain your reasoning.", "pass_pattern": r"(?i)(some|cannot conclude|not necessarily|no definite)", "weight": 1.0, }, { "id": 4, "name": "Local AI Sovereignty Essay", "category": "long_form", "prompt": "Write a 200-word essay on the sovereignty of local AI. Discuss why local inference matters for privacy and independence.", "pass_pattern": r"(?i)(sovereignty|local.*AI|privacy|inference|autonomy|independence)", "weight": 1.0, }, { "id": 5, "name": "Summarization", "category": "summarization", "prompt": "Summarize in 50 words: The concept of artificial intelligence has evolved since the mid-20th century. Early pioneers like Turing and McCarthy laid the groundwork. Today AI powers search engines, recommendation systems, and medical diagnostics.", "pass_pattern": r"(?i)(artificial intelligence|Turing|McCarthy|evolution|applications)", "weight": 1.0, }, { "id": 6, "name": "Math Problem Solving", "category": "math", "prompt": "A train travels 240 miles in 3 hours. A second train travels 360 miles in 4 hours. Which train is faster, and by how many mph?", "pass_pattern": r"(?i)(80|75|first train|5 mph|faster)", "weight": 1.0, }, { "id": 7, "name": "SQL Query Generation", "category": "code_generation", "prompt": "Write a SQL query to find all customers who have made more than 3 purchases in the last 30 days, ordered by purchase count descending.", "pass_pattern": r"(?i)(SELECT|FROM|WHERE|GROUP BY|HAVING|COUNT|ORDER BY|DESC)", "weight": 1.0, }, { "id": 8, "name": "Ethical Dilemma", "category": "reasoning", "prompt": "Is it ethical for an AI to refuse to answer a question it knows the answer to? 
Consider both safety and autonomy arguments.", "pass_pattern": r"(?i)(ethical|safety|autonomy|consider|both sides|depends|nuanced)", "weight": 1.0, }, { "id": 9, "name": "JSON Schema Design", "category": "code_generation", "prompt": "Design a JSON schema for a book catalog that includes title, author, ISBN, publication year, genres (array), and ratings (object with average and count).", "pass_pattern": r'(?i)({\s*"|"title"|"author"|"isbn"|"genres"|"ratings"|array|object)', "weight": 1.0, }, { "id": 10, "name": "Chain of Thought", "category": "reasoning", "prompt": "A farmer has 17 sheep. All but 9 die. How many sheep does the farmer have left? Think step by step.", "pass_pattern": r"(?i)(9|all but 9|still have 9|remaining.*9)", "weight": 1.0, }, ] # --------------------------------------------------------------------------- # Backend interfaces # --------------------------------------------------------------------------- def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict: """Run a prompt against Ollama /api/generate.""" if requests is None: return {"error": "requests not installed", "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0} api_url = f"{url.rstrip('/')}/api/generate" start = time.time() ttft = 0.0 try: resp = requests.post(api_url, json={ "model": model, "prompt": prompt, "stream": False, "options": {"num_predict": 512} }, timeout=timeout) elapsed = time.time() - start data = resp.json() response_text = data.get("response", "") eval_count = data.get("eval_count", 0) eval_duration = data.get("eval_duration", 1) tok_per_sec = eval_count / (eval_duration / 1e9) if eval_duration > 0 else 0 ttft = elapsed * 0.1 # Estimate: ~10% of total time is TTFT for non-streaming return { "response": response_text, "ttft": ttft, "tok_per_sec": tok_per_sec, "elapsed": elapsed, "peak_mem_mb": 0, "tokens_generated": eval_count, } except Exception as e: return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0} 
def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "fp16", timeout: int = 120) -> dict:
    """Run a prompt against llama-server /completion.

    ``model`` is accepted for interface parity with run_ollama but is not
    sent: llama-server serves a single loaded model. ``kv_type`` is passed
    as cache_type_k/cache_type_v. Returns the same result dict shape as
    run_ollama (with "error" on failure).
    """
    if requests is None:
        return {"error": "requests not installed", "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}
    api_url = f"{url.rstrip('/')}/completion"
    start = time.time()
    try:
        resp = requests.post(api_url, json={
            "prompt": prompt,
            "n_predict": 512,
            "cache_type_k": kv_type,
            "cache_type_v": kv_type,
        }, timeout=timeout)
        elapsed = time.time() - start
        data = resp.json()
        response_text = data.get("content", "")
        tokens_predicted = data.get("tokens_predicted", 0)
        tok_per_sec = tokens_predicted / elapsed if elapsed > 0 else 0
        return {
            "response": response_text,
            "ttft": elapsed * 0.15,  # Estimate
            "tok_per_sec": tok_per_sec,
            "elapsed": elapsed,
            "peak_mem_mb": 0,
            "tokens_generated": tokens_predicted,
        }
    except Exception as e:
        return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}


# ---------------------------------------------------------------------------
# Quality evaluation
# ---------------------------------------------------------------------------


def evaluate_quality(response: str, pattern: str) -> dict:
    """Evaluate response quality against expected pattern.

    "has_substance" is a cheap length heuristic (> 50 chars) so a bare
    pattern hit on a trivial reply does not count as a quality pass.
    """
    match = re.search(pattern, response)
    return {
        "matched": match is not None,
        "pattern": pattern,
        "response_length": len(response),
        "has_substance": len(response) > 50,
    }


def evaluate_performance(result: dict, baseline: dict) -> dict:
    """Evaluate performance against baseline.

    When a baseline value is missing or zero, the corresponding ratio
    defaults to 1.0 (neutral pass). Previously a missing baseline TTFT
    divided by a 0.01 floor inflated ttft_ratio enough to fail every
    prompt on a baseline-less first run.
    """
    base_toks = baseline.get("tok_per_sec", 0)
    base_ttft = baseline.get("ttft", 0)
    toks_ratio = result["tok_per_sec"] / base_toks if base_toks > 0 else 1.0
    ttft_ratio = result["ttft"] / base_ttft if base_ttft > 0 else 1.0
    return {
        "tok_per_sec": result["tok_per_sec"],
        "tok_per_sec_baseline": baseline.get("tok_per_sec", 0),
        "tok_per_sec_ratio": round(toks_ratio, 3),
        "tok_per_sec_pass": toks_ratio >= TOKS_BASELINE_RATIO,
        "ttft": result["ttft"],
        "ttft_baseline": baseline.get("ttft", 0),
        "ttft_ratio": round(ttft_ratio, 3),
        "ttft_pass": ttft_ratio <= TTFT_BASELINE_RATIO,
        "peak_mem_mb": result.get("peak_mem_mb", 0),
        "peak_mem_pass": result.get("peak_mem_mb", 0) / 1024 < MEMORY_CEILING_GB,
    }


# ---------------------------------------------------------------------------
# Test matrix runner
# ---------------------------------------------------------------------------


def run_test_matrix(model: str, backend: str, url: str, kv_type: str = "fp16",
                    quick: bool = False, timeout: int = 120) -> dict:
    """Run the full test matrix and return a report dict.

    Progress goes to stderr; the returned report carries per-prompt
    quality/performance results plus aggregate pass rates.
    """
    prompts = TEST_PROMPTS[:3] if quick else TEST_PROMPTS

    # Load baseline if exists; a corrupt baseline file degrades gracefully
    # to "no baseline" rather than aborting the run.
    baseline = {}
    if BASELINE_FILE.exists():
        try:
            baseline = json.loads(BASELINE_FILE.read_text())
        except Exception:
            pass

    run_fn = run_ollama if backend == "ollama" else run_llama_server
    results = []
    pass_count = 0
    fail_count = 0

    print(f"Running {len(prompts)} prompts against {backend} ({model})...", file=sys.stderr)

    for p in prompts:
        # Use the real prompt count so --quick shows [n/3], not [n/10].
        print(f" [{p['id']}/{len(prompts)}] {p['name']}...", file=sys.stderr, end=" ")

        if backend == "ollama":
            result = run_fn(p["prompt"], model, url, timeout)
        else:
            result = run_fn(p["prompt"], model, url, kv_type, timeout)

        if "error" in result:
            print(f"ERROR: {result['error']}", file=sys.stderr)
            results.append({"prompt_id": p["id"], "name": p["name"], "error": result["error"]})
            fail_count += 1
            continue

        quality = evaluate_quality(result["response"], p["pass_pattern"])
        perf = evaluate_performance(result, baseline.get(str(p["id"]), {}))

        quality_pass = quality["matched"] and quality["has_substance"]
        perf_pass = perf.get("tok_per_sec_pass", True) and perf.get("ttft_pass", True)
        overall_pass = quality_pass and perf_pass

        if overall_pass:
            pass_count += 1
            print("PASS", file=sys.stderr)
        else:
            fail_count += 1
            reasons = []
            if not quality_pass:
                reasons.append("quality")
            if not perf_pass:
                reasons.append("perf")
            print(f"FAIL ({', '.join(reasons)})", file=sys.stderr)

        results.append({
            "prompt_id": p["id"],
            "name": p["name"],
            "category": p["category"],
            "quality": quality,
            "performance": perf,
            "pass": overall_pass,
            "response_preview": result["response"][:200],
        })

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "model": model,
        "backend": backend,
        "kv_type": kv_type,
        "total_prompts": len(prompts),
        "passed": pass_count,
        "failed": fail_count,
        "pass_rate": pass_count / len(prompts) if prompts else 0,
        "quality_pass_rate": sum(1 for r in results if r.get("quality", {}).get("matched", False)) / len(prompts) if prompts else 0,
        "results": results,
    }
    return report


def report_to_markdown(report: dict) -> str:
    """Generate markdown test report."""
    lines = [
        "# TurboQuant Test Matrix Report",
        "",
        f"Generated: {report['generated_at'][:16]}",
        f"Model: {report['model']}",
        f"Backend: {report['backend']} (KV: {report.get('kv_type', 'fp16')})",
        "",
        "## Summary",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Total prompts | {report['total_prompts']} |",
        f"| Passed | {report['passed']} |",
        f"| Failed | {report['failed']} |",
        f"| Pass rate | {report['pass_rate']:.0%} |",
        f"| Quality pass rate | {report['quality_pass_rate']:.0%} |",
        "",
        "## Results",
        "",
        "| # | Prompt | Category | Quality | Perf tok/s | Pass |",
        "|---|--------|----------|---------|------------|------|",
    ]
    for r in report["results"]:
        if "error" in r:
            lines.append(f"| {r['prompt_id']} | {r['name']} | - | ERROR | - | ❌ |")
            continue
        q = r.get("quality", {})
        p = r.get("performance", {})
        q_icon = "✅" if q.get("matched") else "❌"
        p_toks = f"{p.get('tok_per_sec', 0):.1f}" if p.get("tok_per_sec") else "-"
        pass_icon = "✅" if r.get("pass") else "❌"
        lines.append(f"| {r['prompt_id']} | {r['name']} | {r.get('category', '')} | {q_icon} | {p_toks} | {pass_icon} |")

    lines.extend([
        "",
        "## Pass Criteria",
        "",
        "| Test | Criteria |",
        "|------|----------|",
        f"| Pattern match | >= {PROMPT_QUALITY_MIN:.0%} of prompts match expected patterns |",
        f"| tok/s | >= {TOKS_BASELINE_RATIO:.0%} of baseline |",
        f"| TTFT | <= {TTFT_BASELINE_RATIO:.0%} of baseline |",
        f"| Peak memory | < {MEMORY_CEILING_GB}GB |",
    ])

    # Go/no-go
    all_pass = report["pass_rate"] >= 0.9
    lines.extend([
        "",
        "## Go/No-Go Decision",
        "",
        f"**{'GO ✅' if all_pass else 'NO-GO ❌'}** — {report['passed']}/{report['total_prompts']} prompts passed ({report['pass_rate']:.0%})",
    ])
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main():
    """Parse CLI args, run the matrix, save JSON results, print a report."""
    parser = argparse.ArgumentParser(description="TurboQuant Full Test Matrix")
    parser.add_argument("--model", default="llama3", help="Model name")
    parser.add_argument("--backend", default="ollama", choices=["ollama", "llama-server"])
    parser.add_argument("--url", default="http://localhost:11434", help="Backend URL")
    parser.add_argument("--kv-type", default="fp16", help="KV cache type (fp16, turbo4, q4_0)")
    parser.add_argument("--quick", action="store_true", help="Run only 3 prompts")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout")
    args = parser.parse_args()

    report = run_test_matrix(args.model, args.backend, args.url, args.kv_type, args.quick, args.timeout)

    # Save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    result_file = RESULTS_DIR / f"matrix_{args.model}_{args.kv_type}_{ts}.json"
    result_file.write_text(json.dumps(report, indent=2) + "\n")
    print(f"Results saved to {result_file}", file=sys.stderr)

    if args.json:
        print(json.dumps(report, indent=2))
    else:
        print(report_to_markdown(report))


if __name__ == "__main__":
    main()