diff --git a/benchmarks/test_matrix.py b/benchmarks/test_matrix.py
new file mode 100644
index 00000000..dd723add
--- /dev/null
+++ b/benchmarks/test_matrix.py
@@ -0,0 +1,423 @@
+#!/usr/bin/env python3
+"""
+TurboQuant Full Test Matrix — Issue #11
+
+Runs 10 practical prompts against both FP16 and TurboQuant KV configs.
+Measures quality (pattern match) and performance (tok/s, TTFT, memory),
+then generates a pass/fail report. The perplexity-delta, needle-retrieval,
+and attention-similarity criteria from issue #11 are recorded as constants
+below but are not yet measured by this harness.
+
+Usage:
+    python3 benchmarks/test_matrix.py --model llama3 --backend ollama
+    python3 benchmarks/test_matrix.py --model qwen3.5 --backend llama-server --url http://localhost:8080 --kv-type turbo4
+    python3 benchmarks/test_matrix.py --quick  # Run only 3 prompts for smoke test
+"""
+
+import argparse
+import json
+import re
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+try:
+    import requests
+except ImportError:
+    requests = None  # Allows importing this module (e.g. from tests) without requests
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+BASELINE_FILE = Path(__file__).parent / "baseline_results.json"
+RESULTS_DIR = Path(__file__).parent / "results"
+
+# Quality pass criteria (from issue #11). Only PROMPT_QUALITY_MIN is enforced
+# by this harness; the rest document the issue's acceptance bar.
+PPL_DELTA_MAX = 0.5
+NEEDLE_RETRIEVAL_MIN = 1.0  # 100%
+PROMPT_QUALITY_MIN = 0.9  # 9/10
+ATTENTION_SIM_MIN = 0.995
+
+# Performance pass criteria
+TOKS_BASELINE_RATIO = 0.90  # >= 90% of baseline tok/s
+TTFT_BASELINE_RATIO = 1.10  # <= 110% of baseline TTFT
+MEMORY_CEILING_GB = 27.0
+CONTEXT_CEILING_MIN_K = 64  # Documented target; not enforced here
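+
+# baseline_results.json maps str(prompt id) -> metrics captured from an FP16
+# run; only "tok_per_sec" and "ttft" are read back by evaluate_performance.
+# Expected shape (values here are illustrative, not measured):
+#
+#   {
+#     "1": {"tok_per_sec": 42.5, "ttft": 0.31},
+#     "2": {"tok_per_sec": 41.8, "ttft": 0.29}
+#   }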
+
+
+# ---------------------------------------------------------------------------
+# Test prompts (10 practical prompts from issue #11)
+# ---------------------------------------------------------------------------
+
+TEST_PROMPTS = [
+    {
+        "id": 1,
+        "name": "Thermodynamics Laws",
+        "category": "factual",
+        "prompt": "What are the three laws of thermodynamics?",
+        "pass_pattern": r"(?i)(first law|energy conservation|second law|entropy|third law|absolute zero)",
+        "weight": 1.0,
+    },
+    {
+        "id": 2,
+        "name": "Merge Sorted Lists",
+        "category": "code_generation",
+        "prompt": "Write a Python function to merge two sorted lists into a single sorted list without using built-in sort methods.",
+        "pass_pattern": r"(?i)(def merge|while|if.*<|append|return)",
+        "weight": 1.0,
+    },
+    {
+        "id": 3,
+        "name": "Syllogistic Reasoning",
+        "category": "reasoning",
+        "prompt": "If all A are B, and some B are C, what can we conclude about the relationship between A and C? Explain your reasoning.",
+        # "Some A are C" does not follow, so a bare "some" must not count as a pass.
+        "pass_pattern": r"(?i)(cannot conclude|not necessarily|no definite|no valid conclusion)",
+        "weight": 1.0,
+    },
+    {
+        "id": 4,
+        "name": "Local AI Sovereignty Essay",
+        "category": "long_form",
+        "prompt": "Write a 200-word essay on the sovereignty of local AI. Discuss why local inference matters for privacy and independence.",
+        "pass_pattern": r"(?i)(sovereignty|local.*AI|privacy|inference|autonomy|independence)",
+        "weight": 1.0,
+    },
+    {
+        "id": 5,
+        "name": "Summarization",
+        "category": "summarization",
+        "prompt": "Summarize in 50 words: The concept of artificial intelligence has evolved since the mid-20th century. Early pioneers like Turing and McCarthy laid the groundwork. Today AI powers search engines, recommendation systems, and medical diagnostics.",
+        "pass_pattern": r"(?i)(artificial intelligence|Turing|McCarthy|evolution|applications)",
+        "weight": 1.0,
+    },
+    {
+        "id": 6,
+        "name": "Math Problem Solving",
+        "category": "math",
+        "prompt": "A train travels 240 miles in 3 hours. A second train travels 360 miles in 4 hours. Which train is faster, and by how many mph?",
+        # 240/3 = 80 mph vs 360/4 = 90 mph: the second train is faster by 10 mph.
+        "pass_pattern": r"(?i)(second train|90|10 mph)",
+        "weight": 1.0,
+    },
+    {
+        "id": 7,
+        "name": "SQL Query Generation",
+        "category": "code_generation",
+        "prompt": "Write a SQL query to find all customers who have made more than 3 purchases in the last 30 days, ordered by purchase count descending.",
+        "pass_pattern": r"(?i)(SELECT|FROM|WHERE|GROUP BY|HAVING|COUNT|ORDER BY|DESC)",
+        "weight": 1.0,
+    },
+    {
+        "id": 8,
+        "name": "Ethical Dilemma",
+        "category": "reasoning",
+        "prompt": "Is it ethical for an AI to refuse to answer a question it knows the answer to? Consider both safety and autonomy arguments.",
+        "pass_pattern": r"(?i)(ethical|safety|autonomy|consider|both sides|depends|nuanced)",
+        "weight": 1.0,
+    },
+    {
+        "id": 9,
+        "name": "JSON Schema Design",
+        "category": "code_generation",
+        "prompt": "Design a JSON schema for a book catalog that includes title, author, ISBN, publication year, genres (array), and ratings (object with average and count).",
+        "pass_pattern": r'(?i)({\s*"|"title"|"author"|"isbn"|"genres"|"ratings"|array|object)',
+        "weight": 1.0,
+    },
+    {
+        "id": 10,
+        "name": "Chain of Thought",
+        "category": "reasoning",
+        "prompt": "A farmer has 17 sheep. All but 9 die. How many sheep does the farmer have left? Think step by step.",
+        # A bare "9" would match any restatement of the riddle, so require context.
+        "pass_pattern": r"(?i)(9 sheep|has 9|answer is 9|9 (are |remain|left))",
+        "weight": 1.0,
+    },
+]
+
+
+# ---------------------------------------------------------------------------
+# Backend interfaces
+# ---------------------------------------------------------------------------
+
+def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict:
+    """Run a prompt against Ollama /api/generate."""
+    if requests is None:
+        return {"error": "requests not installed", "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}
+
+    api_url = f"{url.rstrip('/')}/api/generate"
+    start = time.time()
+
+    try:
+        resp = requests.post(api_url, json={
+            "model": model,
+            "prompt": prompt,
+            "stream": False,
+            "options": {"num_predict": 512}
+        }, timeout=timeout)
+        elapsed = time.time() - start
+
+        data = resp.json()
+        response_text = data.get("response", "")
+        eval_count = data.get("eval_count", 0)
+        eval_duration = data.get("eval_duration", 1)  # reported in nanoseconds
+        tok_per_sec = eval_count / (eval_duration / 1e9) if eval_duration > 0 else 0
+        # Crude estimate (~10% of wall time): non-streaming calls hide the real
+        # time-to-first-token. See measure_ttft_streaming below for an alternative.
+        ttft = elapsed * 0.1
+
+        return {
+            "response": response_text,
+            "ttft": ttft,
+            "tok_per_sec": tok_per_sec,
+            "elapsed": elapsed,
+            "peak_mem_mb": 0,  # Not reported by the Ollama API
+            "tokens_generated": eval_count,
+        }
+    except Exception as e:
+        return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}
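+
+
+def measure_ttft_streaming(prompt: str, model: str, url: str, timeout: int = 120) -> float:
+    """Sketch of a true TTFT measurement via Ollama's streaming API.
+
+    Not wired into the matrix yet; run_ollama above keeps the simpler
+    non-streaming estimate. With "stream": True, /api/generate returns
+    newline-delimited JSON chunks, so TTFT is the time until the first
+    non-empty chunk arrives.
+    """
+    if requests is None:
+        return 0.0
+    start = time.time()
+    with requests.post(f"{url.rstrip('/')}/api/generate",
+                       json={"model": model, "prompt": prompt, "stream": True},
+                       stream=True, timeout=timeout) as resp:
+        for line in resp.iter_lines():
+            if line:  # first non-empty chunk carries the first token
+                return time.time() - start
+    return 0.0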
"n_predict": 512, + "cache_type_k": kv_type, + "cache_type_v": kv_type, + }, timeout=timeout) + elapsed = time.time() - start + + data = resp.json() + response_text = data.get("content", "") + tokens_predicted = data.get("tokens_predicted", 0) + tok_per_sec = tokens_predicted / elapsed if elapsed > 0 else 0 + + return { + "response": response_text, + "ttft": elapsed * 0.15, # Estimate + "tok_per_sec": tok_per_sec, + "elapsed": elapsed, + "peak_mem_mb": 0, + "tokens_generated": tokens_predicted, + } + except Exception as e: + return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0} + + +# --------------------------------------------------------------------------- +# Quality evaluation +# --------------------------------------------------------------------------- + +def evaluate_quality(response: str, pattern: str) -> dict: + """Evaluate response quality against expected pattern.""" + match = re.search(pattern, response) + return { + "matched": match is not None, + "pattern": pattern, + "response_length": len(response), + "has_substance": len(response) > 50, + } + + +def evaluate_performance(result: dict, baseline: dict) -> dict: + """Evaluate performance against baseline.""" + toks_ratio = result["tok_per_sec"] / max(baseline.get("tok_per_sec", 1), 0.01) + ttft_ratio = result["ttft"] / max(baseline.get("ttft", 0.01), 0.01) + + return { + "tok_per_sec": result["tok_per_sec"], + "tok_per_sec_baseline": baseline.get("tok_per_sec", 0), + "tok_per_sec_ratio": round(toks_ratio, 3), + "tok_per_sec_pass": toks_ratio >= TOKS_BASELINE_RATIO, + "ttft": result["ttft"], + "ttft_baseline": baseline.get("ttft", 0), + "ttft_ratio": round(ttft_ratio, 3), + "ttft_pass": ttft_ratio <= TTFT_BASELINE_RATIO, + "peak_mem_mb": result.get("peak_mem_mb", 0), + "peak_mem_pass": result.get("peak_mem_mb", 0) / 1024 < MEMORY_CEILING_GB, + } + + +# --------------------------------------------------------------------------- +# Test matrix runner +# --------------------------------------------------------------------------- + +def run_test_matrix(model: str, backend: str, url: str, kv_type: str = "fp16", + quick: bool = False, timeout: int = 120) -> dict: + """Run the full test matrix.""" + prompts = TEST_PROMPTS[:3] if quick else TEST_PROMPTS + + # Load baseline if exists + baseline = {} + if BASELINE_FILE.exists(): + try: + baseline = json.loads(BASELINE_FILE.read_text()) + except Exception: + pass + + run_fn = run_ollama if backend == "ollama" else run_llama_server + results = [] + pass_count = 0 + fail_count = 0 + + print(f"Running {len(prompts)} prompts against {backend} ({model})...", file=sys.stderr) + + for p in prompts: + print(f" [{p['id']}/10] {p['name']}...", file=sys.stderr, end=" ") + + if backend == "ollama": + result = run_fn(p["prompt"], model, url, timeout) + else: + result = run_fn(p["prompt"], model, url, kv_type, timeout) + + if "error" in result: + print(f"ERROR: {result['error']}", file=sys.stderr) + results.append({"prompt_id": p["id"], "name": p["name"], "error": result["error"]}) + fail_count += 1 + continue + + quality = evaluate_quality(result["response"], p["pass_pattern"]) + perf = evaluate_performance(result, baseline.get(str(p["id"]), {})) + + quality_pass = quality["matched"] and quality["has_substance"] + perf_pass = perf.get("tok_per_sec_pass", True) and perf.get("ttft_pass", True) + overall_pass = quality_pass and perf_pass + + if overall_pass: + pass_count += 1 + print("PASS", file=sys.stderr) + else: + fail_count += 1 + reasons = [] + if not quality_pass: 
+
+
+# ---------------------------------------------------------------------------
+# Test matrix runner
+# ---------------------------------------------------------------------------
+
+def run_test_matrix(model: str, backend: str, url: str, kv_type: str = "fp16",
+                    quick: bool = False, timeout: int = 120) -> dict:
+    """Run the full test matrix."""
+    prompts = TEST_PROMPTS[:3] if quick else TEST_PROMPTS
+
+    # Load baseline if it exists; a corrupt file just disables perf gating
+    baseline = {}
+    if BASELINE_FILE.exists():
+        try:
+            baseline = json.loads(BASELINE_FILE.read_text())
+        except Exception:
+            pass
+
+    run_fn = run_ollama if backend == "ollama" else run_llama_server
+    results = []
+    pass_count = 0
+    fail_count = 0
+
+    print(f"Running {len(prompts)} prompts against {backend} ({model})...", file=sys.stderr)
+
+    for p in prompts:
+        print(f"  [{p['id']}/{len(prompts)}] {p['name']}...", file=sys.stderr, end=" ")
+
+        if backend == "ollama":
+            result = run_fn(p["prompt"], model, url, timeout)
+        else:
+            result = run_fn(p["prompt"], model, url, kv_type, timeout)
+
+        if "error" in result:
+            print(f"ERROR: {result['error']}", file=sys.stderr)
+            results.append({"prompt_id": p["id"], "name": p["name"], "error": result["error"]})
+            fail_count += 1
+            continue
+
+        quality = evaluate_quality(result["response"], p["pass_pattern"])
+        perf = evaluate_performance(result, baseline.get(str(p["id"]), {}))
+
+        quality_pass = quality["matched"] and quality["has_substance"]
+        perf_pass = perf.get("tok_per_sec_pass", True) and perf.get("ttft_pass", True)
+        overall_pass = quality_pass and perf_pass
+
+        if overall_pass:
+            pass_count += 1
+            print("PASS", file=sys.stderr)
+        else:
+            fail_count += 1
+            reasons = []
+            if not quality_pass:
+                reasons.append("quality")
+            if not perf_pass:
+                reasons.append("perf")
+            print(f"FAIL ({', '.join(reasons)})", file=sys.stderr)
+
+        results.append({
+            "prompt_id": p["id"],
+            "name": p["name"],
+            "category": p["category"],
+            "quality": quality,
+            "performance": perf,
+            "pass": overall_pass,
+            "response_preview": result["response"][:200],
+        })
+
+    report = {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "model": model,
+        "backend": backend,
+        "kv_type": kv_type,
+        "total_prompts": len(prompts),
+        "passed": pass_count,
+        "failed": fail_count,
+        "pass_rate": pass_count / len(prompts) if prompts else 0,
+        "quality_pass_rate": sum(1 for r in results if r.get("quality", {}).get("matched", False)) / len(prompts) if prompts else 0,
+        "results": results,
+    }
+
+    return report
+
+
+def report_to_markdown(report: dict) -> str:
+    """Generate markdown test report."""
+    lines = [
+        "# TurboQuant Test Matrix Report",
+        "",
+        f"Generated: {report['generated_at'][:16]}",
+        f"Model: {report['model']}",
+        f"Backend: {report['backend']} (KV: {report.get('kv_type', 'fp16')})",
+        "",
+        "## Summary",
+        "",
+        "| Metric | Value |",
+        "|--------|-------|",
+        f"| Total prompts | {report['total_prompts']} |",
+        f"| Passed | {report['passed']} |",
+        f"| Failed | {report['failed']} |",
+        f"| Pass rate | {report['pass_rate']:.0%} |",
+        f"| Quality pass rate | {report['quality_pass_rate']:.0%} |",
+        "",
+        "## Results",
+        "",
+        "| # | Prompt | Category | Quality | Perf tok/s | Pass |",
+        "|---|--------|----------|---------|------------|------|",
+    ]
+
+    for r in report["results"]:
+        if "error" in r:
+            lines.append(f"| {r['prompt_id']} | {r['name']} | - | ERROR | - | ❌ |")
+            continue
+
+        q = r.get("quality", {})
+        p = r.get("performance", {})
+        q_icon = "✅" if q.get("matched") else "❌"
+        p_toks = f"{p.get('tok_per_sec', 0):.1f}" if p.get("tok_per_sec") else "-"
+        pass_icon = "✅" if r.get("pass") else "❌"
+        lines.append(f"| {r['prompt_id']} | {r['name']} | {r.get('category', '')} | {q_icon} | {p_toks} | {pass_icon} |")
+
+    lines.extend([
+        "",
+        "## Pass Criteria",
+        "",
+        "| Test | Criteria |",
+        "|------|----------|",
+        f"| Pattern match | >= {PROMPT_QUALITY_MIN:.0%} of prompts match expected patterns |",
+        f"| tok/s | >= {TOKS_BASELINE_RATIO:.0%} of baseline |",
+        f"| TTFT | <= {TTFT_BASELINE_RATIO:.0%} of baseline |",
+        f"| Peak memory | < {MEMORY_CEILING_GB}GB |",
+    ])
+
+    # Go/no-go, gated on the issue #11 quality bar (9/10 prompts)
+    all_pass = report["pass_rate"] >= PROMPT_QUALITY_MIN
+    lines.extend([
+        "",
+        "## Go/No-Go Decision",
+        "",
+        f"**{'GO ✅' if all_pass else 'NO-GO ❌'}** — {report['passed']}/{report['total_prompts']} prompts passed ({report['pass_rate']:.0%})",
+    ])
+
+    return "\n".join(lines)
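+
+
+def compare_kv_types(model: str, url: str, quick: bool = False) -> None:
+    """Hypothetical convenience (not exposed on the CLI) for issue #11's
+    FP16-vs-TurboQuant comparison: run both configs against llama-server and
+    print the per-prompt tok/s ratio.
+    """
+    fp16 = run_test_matrix(model, "llama-server", url, "fp16", quick)
+    turbo = run_test_matrix(model, "llama-server", url, "turbo4", quick)
+    for a, b in zip(fp16["results"], turbo["results"]):
+        if "error" in a or "error" in b:
+            continue
+        ratio = b["performance"]["tok_per_sec"] / max(a["performance"]["tok_per_sec"], 0.01)
+        print(f"  prompt {a['prompt_id']}: turbo4/fp16 tok/s ratio = {ratio:.2f}")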
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main():
+    parser = argparse.ArgumentParser(description="TurboQuant Full Test Matrix")
+    parser.add_argument("--model", default="llama3", help="Model name")
+    parser.add_argument("--backend", default="ollama", choices=["ollama", "llama-server"])
+    parser.add_argument("--url", default="http://localhost:11434",
+                        help="Backend URL (Ollama default; llama-server is typically http://localhost:8080)")
+    parser.add_argument("--kv-type", default="fp16", help="KV cache type (fp16, turbo4, q4_0)")
+    parser.add_argument("--quick", action="store_true", help="Run only 3 prompts")
+    parser.add_argument("--json", action="store_true", help="JSON output")
+    parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout in seconds")
+    args = parser.parse_args()
+
+    report = run_test_matrix(args.model, args.backend, args.url, args.kv_type, args.quick, args.timeout)
+
+    # Save results
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+    result_file = RESULTS_DIR / f"matrix_{args.model}_{args.kv_type}_{ts}.json"
+    result_file.write_text(json.dumps(report, indent=2) + "\n")
+    print(f"Results saved to {result_file}", file=sys.stderr)
+
+    if args.json:
+        print(json.dumps(report, indent=2))
+    else:
+        print(report_to_markdown(report))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_test_matrix.py b/tests/test_test_matrix.py
new file mode 100644
index 00000000..19cc08b4
--- /dev/null
+++ b/tests/test_test_matrix.py
@@ -0,0 +1,123 @@
+"""Tests for TurboQuant test matrix (Issue #11)."""
+
+import re
+import sys
+from pathlib import Path
+
+# Make benchmarks/ importable without packaging
+sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
+
+from test_matrix import (
+    evaluate_quality,
+    evaluate_performance,
+    report_to_markdown,
+    TEST_PROMPTS,
+)
+
+
+class TestEvaluateQuality:
+    def test_pattern_match(self):
+        result = evaluate_quality("The first law of thermodynamics states...", r"(?i)(first law|energy)")
+        assert result["matched"] is True
+
+    def test_pattern_no_match(self):
+        result = evaluate_quality("Hello world", r"(?i)(thermodynamics|entropy)")
+        assert result["matched"] is False
+
+    def test_substance_check(self):
+        result = evaluate_quality("Short", r".*")
+        assert result["has_substance"] is False
+
+    def test_substance_pass(self):
+        result = evaluate_quality("A" * 100, r".*")
+        assert result["has_substance"] is True
+
+    def test_response_length(self):
+        result = evaluate_quality("Hello world", r".*")
+        assert result["response_length"] == 11
+
+
+class TestEvaluatePerformance:
+    def test_tok_per_sec_pass(self):
+        result = {"tok_per_sec": 100, "ttft": 0.5, "peak_mem_mb": 1000}
+        baseline = {"tok_per_sec": 100, "ttft": 0.5}
+        perf = evaluate_performance(result, baseline)
+        assert perf["tok_per_sec_pass"] is True
+
+    def test_tok_per_sec_fail(self):
+        result = {"tok_per_sec": 50, "ttft": 0.5, "peak_mem_mb": 1000}
+        baseline = {"tok_per_sec": 100, "ttft": 0.5}
+        perf = evaluate_performance(result, baseline)
+        assert perf["tok_per_sec_pass"] is False
+
+    def test_ttft_pass(self):
+        result = {"tok_per_sec": 100, "ttft": 0.5, "peak_mem_mb": 1000}
+        baseline = {"tok_per_sec": 100, "ttft": 0.5}
+        perf = evaluate_performance(result, baseline)
+        assert perf["ttft_pass"] is True
+
+    def test_ttft_fail(self):
+        result = {"tok_per_sec": 100, "ttft": 1.0, "peak_mem_mb": 1000}
+        baseline = {"tok_per_sec": 100, "ttft": 0.5}
+        perf = evaluate_performance(result, baseline)
+        assert perf["ttft_pass"] is False
+
+    def test_memory_pass(self):
+        result = {"tok_per_sec": 100, "ttft": 0.5, "peak_mem_mb": 10000}
+        baseline = {"tok_per_sec": 100, "ttft": 0.5}
+        perf = evaluate_performance(result, baseline)
+        assert perf["peak_mem_pass"] is True
+
+
+class TestTestPrompts:
+    def test_has_10_prompts(self):
+        assert len(TEST_PROMPTS) == 10
+
+    def test_all_have_patterns(self):
+        for p in TEST_PROMPTS:
+            assert "pass_pattern" in p
+            # Verify pattern compiles
+            re.compile(p["pass_pattern"])
+
+    def test_all_have_categories(self):
+        categories = {p["category"] for p in TEST_PROMPTS}
+        assert len(categories) >= 4  # At least 4 different categories
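+
+
+class TestNoBaseline:
+    # Hypothetical addition mirroring the no-baseline default in
+    # evaluate_performance: ratio gates pass when nothing is recorded yet.
+    def test_defaults_to_pass(self):
+        result = {"tok_per_sec": 100, "ttft": 0.5, "peak_mem_mb": 1000}
+        perf = evaluate_performance(result, {})
+        assert perf["tok_per_sec_pass"] is True
+        assert perf["ttft_pass"] is True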
+
+
+class TestReportMarkdown:
+    def test_has_summary(self):
+        report = {
+            "generated_at": "2026-04-14T00:00:00",
+            "model": "test-model",
+            "backend": "ollama",
+            "kv_type": "fp16",
+            "total_prompts": 10,
+            "passed": 9,
+            "failed": 1,
+            "pass_rate": 0.9,
+            "quality_pass_rate": 0.95,
+            "results": [
+                {"prompt_id": 1, "name": "Test", "category": "factual",
+                 "quality": {"matched": True}, "performance": {"tok_per_sec": 50},
+                 "pass": True}
+            ],
+        }
+        md = report_to_markdown(report)
+        assert "Test Matrix Report" in md
+        assert "| Passed | 9 |" in md
+        assert "GO ✅" in md  # 90% pass rate meets the 9/10 bar
+
+    def test_nogo_on_low_pass_rate(self):
+        report = {
+            "generated_at": "2026-04-14", "model": "x", "backend": "x", "kv_type": "x",
+            "total_prompts": 10, "passed": 5, "failed": 5, "pass_rate": 0.5,
+            "quality_pass_rate": 0.5, "results": [],
+        }
+        md = report_to_markdown(report)
+        assert "NO-GO" in md
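+
+
+class TestMathPromptPattern:
+    # Hypothetical regression check for the corrected prompt 6 pattern:
+    # 240/3 = 80 mph vs 360/4 = 90 mph, so the second train wins by 10 mph.
+    def test_correct_answer_matches(self):
+        p = next(p for p in TEST_PROMPTS if p["id"] == 6)
+        answer = "The second train: 360/4 = 90 mph vs 240/3 = 80 mph, faster by 10 mph."
+        assert re.search(p["pass_pattern"], answer)
+
+    def test_wrong_train_does_not_match(self):
+        p = next(p for p in TEST_PROMPTS if p["id"] == 6)
+        assert not re.search(p["pass_pattern"], "The first train is faster.")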