All checks were successful
Smoke Test / smoke (pull_request) Successful in 7s
10 practical prompts across 6 categories (factual, code, reasoning, long-form, summarization, math). Quality evaluation via pattern match. Performance via tok/s, TTFT, memory. Go/no-go decision at 90% pass rate. Closes #11.
424 lines
16 KiB
Python
#!/usr/bin/env python3
"""
TurboQuant Full Test Matrix — Issue #11

Runs 10 practical prompts against both FP16 and TurboQuant KV configs.
Measures quality (pattern match, perplexity delta) and performance
(tok/s, TTFT, memory). Generates pass/fail report.

Usage:
    python3 benchmarks/test_matrix.py --model llama3 --backend ollama
    python3 benchmarks/test_matrix.py --model qwen3.5 --backend llama-server --kv-type turbo4
    python3 benchmarks/test_matrix.py --quick  # Run only 3 prompts for smoke test
"""

import argparse
import json
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

try:
    import requests
except ImportError:
    requests = None  # Fallback so the module can still be imported without requests


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

BASELINE_FILE = Path(__file__).parent / "baseline_results.json"
RESULTS_DIR = Path(__file__).parent / "results"
PROMPTS_FILE = Path(__file__).parent / "test_prompts.json"
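
# baseline_results.json maps prompt IDs (as strings) to per-prompt reference
# metrics. Assumed structure, inferred from evaluate_performance() below; the
# actual file may carry additional fields:
#
#   {
#     "1": {"tok_per_sec": 42.0, "ttft": 0.35},
#     "2": {"tok_per_sec": 40.5, "ttft": 0.41}
#   }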

# Quality pass criteria (from issue #11)
PPL_DELTA_MAX = 0.5
NEEDLE_RETRIEVAL_MIN = 1.0  # 100%
PROMPT_QUALITY_MIN = 0.9    # 9/10 prompts
ATTENTION_SIM_MIN = 0.995

# Performance pass criteria
TOKS_BASELINE_RATIO = 0.90  # tok/s must be >= 90% of baseline
TTFT_BASELINE_RATIO = 1.10  # TTFT must be <= 110% of baseline
MEMORY_CEILING_GB = 27.0
CONTEXT_CEILING_MIN_K = 64


# ---------------------------------------------------------------------------
# Test prompts (10 practical prompts from issue #11)
# ---------------------------------------------------------------------------

TEST_PROMPTS = [
    {
        "id": 1,
        "name": "Thermodynamics Laws",
        "category": "factual",
        "prompt": "What are the three laws of thermodynamics?",
        "pass_pattern": r"(?i)(first law|energy conservation|second law|entropy|third law|absolute zero)",
        "weight": 1.0,
    },
    {
        "id": 2,
        "name": "Merge Sorted Lists",
        "category": "code_generation",
        "prompt": "Write a Python function to merge two sorted lists into a single sorted list without using built-in sort methods.",
        "pass_pattern": r"(?i)(def merge|while|if.*<|append|return)",
        "weight": 1.0,
    },
    {
        "id": 3,
        "name": "Syllogistic Reasoning",
        "category": "reasoning",
        "prompt": "If all A are B, and some B are C, what can we conclude about the relationship between A and C? Explain your reasoning.",
        "pass_pattern": r"(?i)(some|cannot conclude|not necessarily|no definite)",
        "weight": 1.0,
    },
    {
        "id": 4,
        "name": "Local AI Sovereignty Essay",
        "category": "long_form",
        "prompt": "Write a 200-word essay on the sovereignty of local AI. Discuss why local inference matters for privacy and independence.",
        "pass_pattern": r"(?i)(sovereignty|local.*AI|privacy|inference|autonomy|independence)",
        "weight": 1.0,
    },
    {
        "id": 5,
        "name": "Summarization",
        "category": "summarization",
        "prompt": "Summarize in 50 words: The concept of artificial intelligence has evolved since the mid-20th century. Early pioneers like Turing and McCarthy laid the groundwork. Today AI powers search engines, recommendation systems, and medical diagnostics.",
        "pass_pattern": r"(?i)(artificial intelligence|Turing|McCarthy|evolution|applications)",
        "weight": 1.0,
    },
    {
        "id": 6,
        "name": "Math Problem Solving",
        "category": "math",
        "prompt": "A train travels 240 miles in 3 hours. A second train travels 360 miles in 4 hours. Which train is faster, and by how many mph?",
        # 240/3 = 80 mph vs 360/4 = 90 mph: the second train is faster by 10 mph
        "pass_pattern": r"(?i)(90|80|second train|10 mph|faster)",
        "weight": 1.0,
    },
    {
        "id": 7,
        "name": "SQL Query Generation",
        "category": "code_generation",
        "prompt": "Write a SQL query to find all customers who have made more than 3 purchases in the last 30 days, ordered by purchase count descending.",
        "pass_pattern": r"(?i)(SELECT|FROM|WHERE|GROUP BY|HAVING|COUNT|ORDER BY|DESC)",
        "weight": 1.0,
    },
    {
        "id": 8,
        "name": "Ethical Dilemma",
        "category": "reasoning",
        "prompt": "Is it ethical for an AI to refuse to answer a question it knows the answer to? Consider both safety and autonomy arguments.",
        "pass_pattern": r"(?i)(ethical|safety|autonomy|consider|both sides|depends|nuanced)",
        "weight": 1.0,
    },
    {
        "id": 9,
        "name": "JSON Schema Design",
        "category": "code_generation",
        "prompt": "Design a JSON schema for a book catalog that includes title, author, ISBN, publication year, genres (array), and ratings (object with average and count).",
        "pass_pattern": r'(?i)({\s*"|"title"|"author"|"isbn"|"genres"|"ratings"|array|object)',
        "weight": 1.0,
    },
    {
        "id": 10,
        "name": "Chain of Thought",
        "category": "reasoning",
        "prompt": "A farmer has 17 sheep. All but 9 die. How many sheep does the farmer have left? Think step by step.",
        "pass_pattern": r"(?i)(9|all but 9|still have 9|remaining.*9)",
        "weight": 1.0,
    },
]


# ---------------------------------------------------------------------------
# Backend interfaces
# ---------------------------------------------------------------------------

def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict:
    """Run a prompt against Ollama /api/generate."""
    if requests is None:
        return {"error": "requests not installed", "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}

    api_url = f"{url.rstrip('/')}/api/generate"
    start = time.time()

    try:
        resp = requests.post(api_url, json={
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {"num_predict": 512},
        }, timeout=timeout)
        elapsed = time.time() - start

        data = resp.json()
        response_text = data.get("response", "")
        eval_count = data.get("eval_count", 0)
        eval_duration = data.get("eval_duration", 1)  # Ollama reports durations in nanoseconds
        tok_per_sec = eval_count / (eval_duration / 1e9) if eval_duration > 0 else 0
        ttft = elapsed * 0.1  # Estimate: ~10% of total time is TTFT for non-streaming

        return {
            "response": response_text,
            "ttft": ttft,
            "tok_per_sec": tok_per_sec,
            "elapsed": elapsed,
            "peak_mem_mb": 0,
            "tokens_generated": eval_count,
        }
    except Exception as e:
        return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}


def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "fp16", timeout: int = 120) -> dict:
    """Run a prompt against llama-server /completion."""
    if requests is None:
        return {"error": "requests not installed", "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}

    api_url = f"{url.rstrip('/')}/completion"
    start = time.time()

    try:
        resp = requests.post(api_url, json={
            "prompt": prompt,
            "n_predict": 512,
            "cache_type_k": kv_type,
            "cache_type_v": kv_type,
        }, timeout=timeout)
        elapsed = time.time() - start

        data = resp.json()
        response_text = data.get("content", "")
        tokens_predicted = data.get("tokens_predicted", 0)
        tok_per_sec = tokens_predicted / elapsed if elapsed > 0 else 0

        return {
            "response": response_text,
            "ttft": elapsed * 0.15,  # Estimate; this call is also non-streaming
            "tok_per_sec": tok_per_sec,
            "elapsed": elapsed,
            "peak_mem_mb": 0,
            "tokens_generated": tokens_predicted,
        }
    except Exception as e:
        return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}


# ---------------------------------------------------------------------------
# Quality evaluation
# ---------------------------------------------------------------------------

def evaluate_quality(response: str, pattern: str) -> dict:
    """Evaluate response quality against the expected pattern."""
    match = re.search(pattern, response)
    return {
        "matched": match is not None,
        "pattern": pattern,
        "response_length": len(response),
        "has_substance": len(response) > 50,
    }
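
# Example: evaluate_quality("def merge(a, b): ...", TEST_PROMPTS[1]["pass_pattern"])
# matches on "def merge", but has_substance is False for such a short string, so
# a prompt only passes when the response is both on-pattern and substantive.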


def evaluate_performance(result: dict, baseline: dict) -> dict:
    """Evaluate performance against baseline; checks pass when no baseline exists."""
    has_baseline = baseline.get("tok_per_sec", 0) > 0
    toks_ratio = result["tok_per_sec"] / max(baseline.get("tok_per_sec", 1), 0.01)
    ttft_ratio = result["ttft"] / max(baseline.get("ttft", 0.01), 0.01)

    return {
        "tok_per_sec": result["tok_per_sec"],
        "tok_per_sec_baseline": baseline.get("tok_per_sec", 0),
        "tok_per_sec_ratio": round(toks_ratio, 3),
        "tok_per_sec_pass": toks_ratio >= TOKS_BASELINE_RATIO if has_baseline else True,
        "ttft": result["ttft"],
        "ttft_baseline": baseline.get("ttft", 0),
        "ttft_ratio": round(ttft_ratio, 3),
        "ttft_pass": ttft_ratio <= TTFT_BASELINE_RATIO if has_baseline else True,
        "peak_mem_mb": result.get("peak_mem_mb", 0),
        "peak_mem_pass": result.get("peak_mem_mb", 0) / 1024 < MEMORY_CEILING_GB,
    }
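
# Worked example of the thresholds: baseline 40.0 tok/s, measured 37.2 ->
# ratio 0.93 >= 0.90, pass; baseline TTFT 0.40 s, measured 0.46 s -> ratio
# 1.15 > 1.10, fail. (Illustrative numbers, not measurements.)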


# ---------------------------------------------------------------------------
# Test matrix runner
# ---------------------------------------------------------------------------

def run_test_matrix(model: str, backend: str, url: str, kv_type: str = "fp16",
                    quick: bool = False, timeout: int = 120) -> dict:
    """Run the full test matrix."""
    prompts = TEST_PROMPTS[:3] if quick else TEST_PROMPTS

    # Load baseline if it exists
    baseline = {}
    if BASELINE_FILE.exists():
        try:
            baseline = json.loads(BASELINE_FILE.read_text())
        except Exception:
            pass

    run_fn = run_ollama if backend == "ollama" else run_llama_server
    results = []
    pass_count = 0
    fail_count = 0

    print(f"Running {len(prompts)} prompts against {backend} ({model})...", file=sys.stderr)

    for p in prompts:
        print(f"  [{p['id']}/{len(prompts)}] {p['name']}...", file=sys.stderr, end=" ")

        if backend == "ollama":
            result = run_fn(p["prompt"], model, url, timeout)
        else:
            result = run_fn(p["prompt"], model, url, kv_type, timeout)

        if "error" in result:
            print(f"ERROR: {result['error']}", file=sys.stderr)
            results.append({"prompt_id": p["id"], "name": p["name"], "error": result["error"]})
            fail_count += 1
            continue

        quality = evaluate_quality(result["response"], p["pass_pattern"])
        perf = evaluate_performance(result, baseline.get(str(p["id"]), {}))

        quality_pass = quality["matched"] and quality["has_substance"]
        perf_pass = perf.get("tok_per_sec_pass", True) and perf.get("ttft_pass", True)
        overall_pass = quality_pass and perf_pass

        if overall_pass:
            pass_count += 1
            print("PASS", file=sys.stderr)
        else:
            fail_count += 1
            reasons = []
            if not quality_pass:
                reasons.append("quality")
            if not perf_pass:
                reasons.append("perf")
            print(f"FAIL ({', '.join(reasons)})", file=sys.stderr)

        results.append({
            "prompt_id": p["id"],
            "name": p["name"],
            "category": p["category"],
            "quality": quality,
            "performance": perf,
            "pass": overall_pass,
            "response_preview": result["response"][:200],
        })

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "model": model,
        "backend": backend,
        "kv_type": kv_type,
        "total_prompts": len(prompts),
        "passed": pass_count,
        "failed": fail_count,
        "pass_rate": pass_count / len(prompts) if prompts else 0,
        "quality_pass_rate": sum(1 for r in results if r.get("quality", {}).get("matched", False)) / len(prompts) if prompts else 0,
        "results": results,
    }

    return report
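

# Hypothetical helper (not exposed via the CLI): derive baseline_results.json
# from an FP16 run's report, matching the per-prompt structure that
# run_test_matrix() reads back above.
def write_baseline(report: dict, path: Path = BASELINE_FILE) -> None:
    """Persist per-prompt tok/s and TTFT from a report as the new baseline."""
    baseline = {
        str(r["prompt_id"]): {
            "tok_per_sec": r["performance"]["tok_per_sec"],
            "ttft": r["performance"]["ttft"],
        }
        for r in report["results"]
        if "performance" in r
    }
    path.write_text(json.dumps(baseline, indent=2) + "\n")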


def report_to_markdown(report: dict) -> str:
    """Generate a markdown test report."""
    lines = [
        "# TurboQuant Test Matrix Report",
        "",
        f"Generated: {report['generated_at'][:16]}",
        f"Model: {report['model']}",
        f"Backend: {report['backend']} (KV: {report.get('kv_type', 'fp16')})",
        "",
        "## Summary",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Total prompts | {report['total_prompts']} |",
        f"| Passed | {report['passed']} |",
        f"| Failed | {report['failed']} |",
        f"| Pass rate | {report['pass_rate']:.0%} |",
        f"| Quality pass rate | {report['quality_pass_rate']:.0%} |",
        "",
        "## Results",
        "",
        "| # | Prompt | Category | Quality | Perf tok/s | Pass |",
        "|---|--------|----------|---------|------------|------|",
    ]

    for r in report["results"]:
        if "error" in r:
            lines.append(f"| {r['prompt_id']} | {r['name']} | - | ERROR | - | ❌ |")
            continue

        q = r.get("quality", {})
        p = r.get("performance", {})
        q_icon = "✅" if q.get("matched") else "❌"
        p_toks = f"{p.get('tok_per_sec', 0):.1f}" if p.get("tok_per_sec") else "-"
        pass_icon = "✅" if r.get("pass") else "❌"
        lines.append(f"| {r['prompt_id']} | {r['name']} | {r.get('category', '')} | {q_icon} | {p_toks} | {pass_icon} |")

    lines.extend([
        "",
        "## Pass Criteria",
        "",
        "| Test | Criteria |",
        "|------|----------|",
        f"| Pattern match | >= {PROMPT_QUALITY_MIN:.0%} of prompts match expected patterns |",
        f"| tok/s | >= {TOKS_BASELINE_RATIO:.0%} of baseline |",
        f"| TTFT | <= {TTFT_BASELINE_RATIO:.0%} of baseline |",
        f"| Peak memory | < {MEMORY_CEILING_GB} GB |",
    ])

    # Go/no-go decision
    all_pass = report["pass_rate"] >= PROMPT_QUALITY_MIN
    lines.extend([
        "",
        "## Go/No-Go Decision",
        "",
        f"**{'GO ✅' if all_pass else 'NO-GO ❌'}** — {report['passed']}/{report['total_prompts']} prompts passed ({report['pass_rate']:.0%})",
    ])

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(description="TurboQuant Full Test Matrix")
    parser.add_argument("--model", default="llama3", help="Model name")
    parser.add_argument("--backend", default="ollama", choices=["ollama", "llama-server"])
    parser.add_argument("--url", default="http://localhost:11434", help="Backend URL")
    parser.add_argument("--kv-type", default="fp16", help="KV cache type (fp16, turbo4, q4_0)")
    parser.add_argument("--quick", action="store_true", help="Run only 3 prompts")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout in seconds")
    args = parser.parse_args()

    report = run_test_matrix(args.model, args.backend, args.url, args.kv_type, args.quick, args.timeout)

    # Save results
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    result_file = RESULTS_DIR / f"matrix_{args.model}_{args.kv_type}_{ts}.json"
    result_file.write_text(json.dumps(report, indent=2) + "\n")
    print(f"Results saved to {result_file}", file=sys.stderr)

    if args.json:
        print(json.dumps(report, indent=2))
    else:
        print(report_to_markdown(report))


if __name__ == "__main__":
    main()