Compare commits
1 Commits
step35/67-
...
burn/11-17
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b31cd93148 |
423
benchmarks/test_matrix.py
Normal file
423
benchmarks/test_matrix.py
Normal file
@@ -0,0 +1,423 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TurboQuant Full Test Matrix — Issue #11
|
||||
|
||||
Runs 10 practical prompts against both FP16 and TurboQuant KV configs.
|
||||
Measures quality (pattern match, perplexity delta) and performance
|
||||
(tok/s, TTFT, memory). Generates pass/fail report.
|
||||
|
||||
Usage:
|
||||
python3 benchmarks/test_matrix.py --model llama3 --backend ollama
|
||||
python3 benchmarks/test_matrix.py --model qwen3.5 --backend llama-server --kv-type turbo4
|
||||
python3 benchmarks/test_matrix.py --quick # Run only 3 prompts for smoke test
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
requests = None # Fallback for testing without requests
|
||||
|
||||
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Paths are anchored to this benchmarks/ directory so the script works from
# any CWD.
BASELINE_FILE = Path(__file__).parent / "baseline_results.json"  # per-prompt baseline metrics (keyed by prompt id)
RESULTS_DIR = Path(__file__).parent / "results"  # timestamped run reports are written here
PROMPTS_FILE = Path(__file__).parent / "test_prompts.json"  # NOTE(review): not read anywhere in this file — confirm intent

# Quality pass criteria (from issue #11)
PPL_DELTA_MAX = 0.5  # max allowed perplexity increase vs FP16 (not checked in this file — see issue #11)
NEEDLE_RETRIEVAL_MIN = 1.0  # 100%; needle-retrieval success floor (not checked in this file)
PROMPT_QUALITY_MIN = 0.9  # 9/10 prompts must match their pass_pattern
ATTENTION_SIM_MIN = 0.995  # attention-similarity floor (not checked in this file)

# Performance pass criteria
TOKS_BASELINE_RATIO = 0.90  # >= 90% baseline throughput required to pass
TTFT_BASELINE_RATIO = 1.10  # <= 110% baseline time-to-first-token required to pass
MEMORY_CEILING_GB = 27.0  # hard peak-memory ceiling (GB) used by evaluate_performance
CONTEXT_CEILING_MIN_K = 64  # presumably minimum context length in K tokens — not used in this file; confirm
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Test prompts (10 practical prompts from issue #11)
#
# Each entry:
#   id           -- 1-based prompt number; also the key for baseline lookups
#   name         -- short label used in progress output and the report table
#   category     -- rough task family, shown in the report
#   prompt       -- the text sent to the backend verbatim
#   pass_pattern -- regex; the response passes quality if re.search() matches
#   weight       -- currently uniform 1.0; not consumed by the visible scoring
#                   code in this file
# ---------------------------------------------------------------------------

TEST_PROMPTS = [
    {
        "id": 1,
        "name": "Thermodynamics Laws",
        "category": "factual",
        "prompt": "What are the three laws of thermodynamics?",
        "pass_pattern": r"(?i)(first law|energy conservation|second law|entropy|third law|absolute zero)",
        "weight": 1.0,
    },
    {
        "id": 2,
        "name": "Merge Sorted Lists",
        "category": "code_generation",
        "prompt": "Write a Python function to merge two sorted lists into a single sorted list without using built-in sort methods.",
        "pass_pattern": r"(?i)(def merge|while|if.*<|append|return)",
        "weight": 1.0,
    },
    {
        "id": 3,
        "name": "Syllogistic Reasoning",
        "category": "reasoning",
        "prompt": "If all A are B, and some B are C, what can we conclude about the relationship between A and C? Explain your reasoning.",
        "pass_pattern": r"(?i)(some|cannot conclude|not necessarily|no definite)",
        "weight": 1.0,
    },
    {
        "id": 4,
        "name": "Local AI Sovereignty Essay",
        "category": "long_form",
        "prompt": "Write a 200-word essay on the sovereignty of local AI. Discuss why local inference matters for privacy and independence.",
        "pass_pattern": r"(?i)(sovereignty|local.*AI|privacy|inference|autonomy|independence)",
        "weight": 1.0,
    },
    {
        "id": 5,
        "name": "Summarization",
        "category": "summarization",
        "prompt": "Summarize in 50 words: The concept of artificial intelligence has evolved since the mid-20th century. Early pioneers like Turing and McCarthy laid the groundwork. Today AI powers search engines, recommendation systems, and medical diagnostics.",
        "pass_pattern": r"(?i)(artificial intelligence|Turing|McCarthy|evolution|applications)",
        "weight": 1.0,
    },
    {
        "id": 6,
        "name": "Math Problem Solving",
        "category": "math",
        "prompt": "A train travels 240 miles in 3 hours. A second train travels 360 miles in 4 hours. Which train is faster, and by how many mph?",
        "pass_pattern": r"(?i)(80|75|first train|5 mph|faster)",
        "weight": 1.0,
    },
    {
        "id": 7,
        "name": "SQL Query Generation",
        "category": "code_generation",
        "prompt": "Write a SQL query to find all customers who have made more than 3 purchases in the last 30 days, ordered by purchase count descending.",
        "pass_pattern": r"(?i)(SELECT|FROM|WHERE|GROUP BY|HAVING|COUNT|ORDER BY|DESC)",
        "weight": 1.0,
    },
    {
        "id": 8,
        "name": "Ethical Dilemma",
        "category": "reasoning",
        "prompt": "Is it ethical for an AI to refuse to answer a question it knows the answer to? Consider both safety and autonomy arguments.",
        "pass_pattern": r"(?i)(ethical|safety|autonomy|consider|both sides|depends|nuanced)",
        "weight": 1.0,
    },
    {
        "id": 9,
        "name": "JSON Schema Design",
        "category": "code_generation",
        "prompt": "Design a JSON schema for a book catalog that includes title, author, ISBN, publication year, genres (array), and ratings (object with average and count).",
        "pass_pattern": r'(?i)({\s*"|"title"|"author"|"isbn"|"genres"|"ratings"|array|object)',
        "weight": 1.0,
    },
    {
        "id": 10,
        "name": "Chain of Thought",
        "category": "reasoning",
        "prompt": "A farmer has 17 sheep. All but 9 die. How many sheep does the farmer have left? Think step by step.",
        "pass_pattern": r"(?i)(9|all but 9|still have 9|remaining.*9)",
        "weight": 1.0,
    },
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Backend interfaces
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_ollama(prompt: str, model: str, url: str, timeout: int = 120) -> dict:
    """Run a single prompt against Ollama's /api/generate endpoint.

    Args:
        prompt: Text to send to the model.
        model: Ollama model name.
        url: Base URL of the Ollama server (trailing slash tolerated).
        timeout: Request timeout in seconds.

    Returns:
        On success: dict with "response", "ttft", "tok_per_sec", "elapsed",
        "peak_mem_mb" (always 0 here — not measured), "tokens_generated".
        On failure (or when requests is unavailable): dict with an "error" key
        plus zeroed metrics.
    """
    if requests is None:
        return {"error": "requests not installed", "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}

    endpoint = f"{url.rstrip('/')}/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"num_predict": 512},
    }

    t0 = time.time()
    try:
        reply = requests.post(endpoint, json=payload, timeout=timeout)
        wall = time.time() - t0

        body = reply.json()
        generated = body.get("eval_count", 0)
        duration_ns = body.get("eval_duration", 1)  # Ollama reports nanoseconds
        throughput = generated / (duration_ns / 1e9) if duration_ns > 0 else 0

        return {
            "response": body.get("response", ""),
            "ttft": wall * 0.1,  # Estimate: ~10% of total time is TTFT for non-streaming
            "tok_per_sec": throughput,
            "elapsed": wall,
            "peak_mem_mb": 0,
            "tokens_generated": generated,
        }
    except Exception as e:
        return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}
|
||||
|
||||
|
||||
def run_llama_server(prompt: str, model: str, url: str, kv_type: str = "fp16", timeout: int = 120) -> dict:
    """Run a single prompt against llama-server's /completion endpoint.

    Note: *model* is accepted for signature symmetry with run_ollama but is
    not sent — llama-server serves whichever model it was launched with.

    Args:
        prompt: Text to send to the model.
        model: Unused (see note above).
        url: Base URL of the llama-server instance.
        kv_type: Cache type forwarded as cache_type_k / cache_type_v.
        timeout: Request timeout in seconds.

    Returns:
        Same shape as run_ollama(): metrics dict on success, or a dict with
        an "error" key plus zeroed metrics on failure.
    """
    if requests is None:
        return {"error": "requests not installed", "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}

    endpoint = f"{url.rstrip('/')}/completion"
    payload = {
        "prompt": prompt,
        "n_predict": 512,
        "cache_type_k": kv_type,
        "cache_type_v": kv_type,
    }

    t0 = time.time()
    try:
        reply = requests.post(endpoint, json=payload, timeout=timeout)
        wall = time.time() - t0

        body = reply.json()
        predicted = body.get("tokens_predicted", 0)

        return {
            "response": body.get("content", ""),
            "ttft": wall * 0.15,  # Estimate
            "tok_per_sec": predicted / wall if wall > 0 else 0,
            "elapsed": wall,
            "peak_mem_mb": 0,
            "tokens_generated": predicted,
        }
    except Exception as e:
        return {"error": str(e), "response": "", "ttft": 0, "tok_per_sec": 0, "peak_mem_mb": 0}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Quality evaluation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def evaluate_quality(response: str, pattern: str) -> dict:
    """Score a model response against the prompt's expected regex.

    Args:
        response: The model's generated text.
        pattern: Regex searched anywhere in *response* (re.search semantics).

    Returns:
        Dict with:
          matched          -- True if the pattern was found
          pattern          -- the pattern searched (echoed for the report)
          response_length  -- character count of the response
          has_substance    -- True when the response exceeds 50 characters
    """
    length = len(response)
    verdict = {"matched": re.search(pattern, response) is not None}
    verdict["pattern"] = pattern
    verdict["response_length"] = length
    verdict["has_substance"] = length > 50
    return verdict
|
||||
|
||||
|
||||
def evaluate_performance(result: dict, baseline: dict) -> dict:
    """Evaluate performance against per-prompt baseline metrics.

    Args:
        result: Backend result dict; must contain "tok_per_sec" and "ttft",
            may contain "peak_mem_mb".
        baseline: Baseline metrics for this prompt; may be empty (no baseline
            recorded yet).

    Returns:
        Dict with raw values, baseline values, ratios, and boolean pass flags
        for throughput, TTFT, and peak memory.

    Fix: previously a missing baseline made ttft_ratio explode (division by
    the 0.01 fallback), so every prompt failed its TTFT check on a first run
    with no baseline file. Missing baseline keys now pass automatically —
    there is nothing to regress against.
    """
    has_toks_baseline = "tok_per_sec" in baseline
    has_ttft_baseline = "ttft" in baseline

    # Ratios are still computed with the old fallbacks so the reported
    # numbers stay comparable with earlier runs; only the pass flags change.
    toks_ratio = result["tok_per_sec"] / max(baseline.get("tok_per_sec", 1), 0.01)
    ttft_ratio = result["ttft"] / max(baseline.get("ttft", 0.01), 0.01)

    peak_mem_mb = result.get("peak_mem_mb", 0)

    return {
        "tok_per_sec": result["tok_per_sec"],
        "tok_per_sec_baseline": baseline.get("tok_per_sec", 0),
        "tok_per_sec_ratio": round(toks_ratio, 3),
        "tok_per_sec_pass": (not has_toks_baseline) or toks_ratio >= TOKS_BASELINE_RATIO,
        "ttft": result["ttft"],
        "ttft_baseline": baseline.get("ttft", 0),
        "ttft_ratio": round(ttft_ratio, 3),
        "ttft_pass": (not has_ttft_baseline) or ttft_ratio <= TTFT_BASELINE_RATIO,
        "peak_mem_mb": peak_mem_mb,
        "peak_mem_pass": peak_mem_mb / 1024 < MEMORY_CEILING_GB,  # MB -> GB against the ceiling
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test matrix runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_test_matrix(model: str, backend: str, url: str, kv_type: str = "fp16",
                    quick: bool = False, timeout: int = 120) -> dict:
    """Run the full test matrix and build a report dict.

    Args:
        model: Model name passed through to the backend.
        backend: "ollama" or "llama-server".
        url: Base URL of the backend HTTP server.
        kv_type: KV cache type (used by the llama-server backend only).
        quick: When True, run only the first 3 prompts as a smoke test.
        timeout: Per-prompt request timeout in seconds.

    Returns:
        Report dict: run metadata, aggregate pass/fail counts and rates, and
        a per-prompt "results" list (error entries carry an "error" key).
    """
    prompts = TEST_PROMPTS[:3] if quick else TEST_PROMPTS
    total = len(prompts)

    # Load per-prompt baselines if present. Best-effort: a missing or corrupt
    # file simply means there is no baseline to compare against.
    baseline = {}
    if BASELINE_FILE.exists():
        try:
            baseline = json.loads(BASELINE_FILE.read_text())
        except Exception:
            pass

    results = []
    pass_count = 0
    fail_count = 0

    print(f"Running {total} prompts against {backend} ({model})...", file=sys.stderr)

    for p in prompts:
        # Fix: the progress denominator was hard-coded to 10, which was wrong
        # under --quick (3 prompts).
        print(f"  [{p['id']}/{total}] {p['name']}...", file=sys.stderr, end=" ")

        # Dispatch directly on the backend. (The previous run_fn indirection
        # was dead weight: the call sites branched on backend anyway because
        # the two runners take different arguments.)
        if backend == "ollama":
            result = run_ollama(p["prompt"], model, url, timeout)
        else:
            result = run_llama_server(p["prompt"], model, url, kv_type, timeout)

        if "error" in result:
            print(f"ERROR: {result['error']}", file=sys.stderr)
            results.append({"prompt_id": p["id"], "name": p["name"], "error": result["error"]})
            fail_count += 1
            continue

        quality = evaluate_quality(result["response"], p["pass_pattern"])
        perf = evaluate_performance(result, baseline.get(str(p["id"]), {}))

        quality_pass = quality["matched"] and quality["has_substance"]
        perf_pass = perf.get("tok_per_sec_pass", True) and perf.get("ttft_pass", True)
        overall_pass = quality_pass and perf_pass

        if overall_pass:
            pass_count += 1
            print("PASS", file=sys.stderr)
        else:
            fail_count += 1
            reasons = []
            if not quality_pass:
                reasons.append("quality")
            if not perf_pass:
                reasons.append("perf")
            print(f"FAIL ({', '.join(reasons)})", file=sys.stderr)

        results.append({
            "prompt_id": p["id"],
            "name": p["name"],
            "category": p["category"],
            "quality": quality,
            "performance": perf,
            "pass": overall_pass,
            "response_preview": result["response"][:200],  # keep report files small
        })

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "model": model,
        "backend": backend,
        "kv_type": kv_type,
        "total_prompts": total,
        "passed": pass_count,
        "failed": fail_count,
        "pass_rate": pass_count / total if prompts else 0,
        "quality_pass_rate": sum(1 for r in results if r.get("quality", {}).get("matched", False)) / total if prompts else 0,
        "results": results,
    }

    return report
|
||||
|
||||
|
||||
def report_to_markdown(report: dict) -> str:
    """Render a run report (from run_test_matrix) as a markdown document.

    Sections: header metadata, summary table, per-prompt results table,
    static pass-criteria table, and a GO/NO-GO verdict based on pass_rate.
    """
    # Header + summary table. generated_at is truncated to minutes (first 16
    # chars of the ISO timestamp).
    lines = [
        f"# TurboQuant Test Matrix Report",
        "",
        f"Generated: {report['generated_at'][:16]}",
        f"Model: {report['model']}",
        f"Backend: {report['backend']} (KV: {report.get('kv_type', 'fp16')})",
        "",
        "## Summary",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Total prompts | {report['total_prompts']} |",
        f"| Passed | {report['passed']} |",
        f"| Failed | {report['failed']} |",
        f"| Pass rate | {report['pass_rate']:.0%} |",
        f"| Quality pass rate | {report['quality_pass_rate']:.0%} |",
        "",
        "## Results",
        "",
        "| # | Prompt | Category | Quality | Perf tok/s | Pass |",
        "|---|--------|----------|---------|------------|------|",
    ]

    # One table row per prompt; error entries (no quality/performance keys)
    # get a dedicated ERROR row.
    for r in report["results"]:
        if "error" in r:
            lines.append(f"| {r['prompt_id']} | {r['name']} | - | ERROR | - | ❌ |")
            continue

        q = r.get("quality", {})
        p = r.get("performance", {})
        q_icon = "✅" if q.get("matched") else "❌"
        # "-" when tok/s is absent or zero (falsy check covers both).
        p_toks = f"{p.get('tok_per_sec', 0):.1f}" if p.get("tok_per_sec") else "-"
        pass_icon = "✅" if r.get("pass") else "❌"
        lines.append(f"| {r['prompt_id']} | {r['name']} | {r.get('category', '')} | {q_icon} | {p_toks} | {pass_icon} |")

    # Static criteria table, rendered from the module-level thresholds.
    lines.extend([
        "",
        "## Pass Criteria",
        "",
        "| Test | Criteria |",
        "|------|----------|",
        f"| Pattern match | >= {PROMPT_QUALITY_MIN:.0%} of prompts match expected patterns |",
        f"| tok/s | >= {TOKS_BASELINE_RATIO:.0%} of baseline |",
        f"| TTFT | <= {TTFT_BASELINE_RATIO:.0%} of baseline |",
        f"| Peak memory | < {MEMORY_CEILING_GB}GB |",
    ])

    # Go/no-go: GO requires a 90% overall pass rate.
    all_pass = report["pass_rate"] >= 0.9
    lines.extend([
        "",
        "## Go/No-Go Decision",
        "",
        f"**{'GO ✅' if all_pass else 'NO-GO ❌'}** — {report['passed']}/{report['total_prompts']} prompts passed ({report['pass_rate']:.0%})",
    ])

    return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """CLI entry point: run the matrix, persist the JSON report, print it."""
    parser = argparse.ArgumentParser(description="TurboQuant Full Test Matrix")
    parser.add_argument("--model", default="llama3", help="Model name")
    parser.add_argument("--backend", default="ollama", choices=["ollama", "llama-server"])
    parser.add_argument("--url", default="http://localhost:11434", help="Backend URL")
    parser.add_argument("--kv-type", default="fp16", help="KV cache type (fp16, turbo4, q4_0)")
    parser.add_argument("--quick", action="store_true", help="Run only 3 prompts")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--timeout", type=int, default=120, help="Per-prompt timeout")
    args = parser.parse_args()

    report = run_test_matrix(args.model, args.backend, args.url, args.kv_type, args.quick, args.timeout)

    # Always persist the raw report to a timestamped file so runs accumulate.
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = RESULTS_DIR / f"matrix_{args.model}_{args.kv_type}_{stamp}.json"
    out_path.write_text(json.dumps(report, indent=2) + "\n")
    print(f"Results saved to {out_path}", file=sys.stderr)

    # Stdout carries either the raw JSON or the human-readable markdown.
    rendered = json.dumps(report, indent=2) if args.json else report_to_markdown(report)
    print(rendered)


if __name__ == "__main__":
    main()
|
||||
123
tests/test_test_matrix.py
Normal file
123
tests/test_test_matrix.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""Tests for TurboQuant test matrix (Issue #11)."""
|
||||
|
||||
import json
|
||||
import re
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
|
||||
|
||||
from test_matrix import (
|
||||
evaluate_quality,
|
||||
evaluate_performance,
|
||||
report_to_markdown,
|
||||
TEST_PROMPTS,
|
||||
PPL_DELTA_MAX,
|
||||
TOKS_BASELINE_RATIO,
|
||||
TTFT_BASELINE_RATIO,
|
||||
)
|
||||
|
||||
|
||||
class TestEvaluateQuality:
    """Unit tests for evaluate_quality(): pattern matching and substance."""

    def test_pattern_match(self):
        verdict = evaluate_quality("The first law of thermodynamics states...", r"(?i)(first law|energy)")
        assert verdict["matched"] is True

    def test_pattern_no_match(self):
        verdict = evaluate_quality("Hello world", r"(?i)(thermodynamics|entropy)")
        assert verdict["matched"] is False

    def test_substance_check(self):
        # 5 characters is well under the 50-character substance floor.
        verdict = evaluate_quality("Short", r".*")
        assert verdict["has_substance"] is False

    def test_substance_pass(self):
        verdict = evaluate_quality("A" * 100, r".*")
        assert verdict["has_substance"] is True

    def test_response_length(self):
        verdict = evaluate_quality("Hello world", r".*")
        assert verdict["response_length"] == 11
|
||||
|
||||
|
||||
class TestEvaluatePerformance:
    """Unit tests for evaluate_performance() pass/fail thresholds."""

    @staticmethod
    def _perf(tok_per_sec, ttft, peak_mem_mb=1000):
        # Shared fixture: compare a measured result against a fixed baseline
        # of 100 tok/s and 0.5 s TTFT.
        measured = {"tok_per_sec": tok_per_sec, "ttft": ttft, "peak_mem_mb": peak_mem_mb}
        reference = {"tok_per_sec": 100, "ttft": 0.5}
        return evaluate_performance(measured, reference)

    def test_tok_per_sec_pass(self):
        # Matching the baseline exactly clears the >= 90% threshold.
        assert self._perf(100, 0.5)["tok_per_sec_pass"] is True

    def test_tok_per_sec_fail(self):
        # Half the baseline throughput falls below 90%.
        assert self._perf(50, 0.5)["tok_per_sec_pass"] is False

    def test_ttft_pass(self):
        assert self._perf(100, 0.5)["ttft_pass"] is True

    def test_ttft_fail(self):
        # Double the baseline TTFT exceeds the 110% ceiling.
        assert self._perf(100, 1.0)["ttft_pass"] is False

    def test_memory_pass(self):
        # 10000 MB ≈ 9.8 GB, comfortably under the 27 GB ceiling.
        assert self._perf(100, 0.5, peak_mem_mb=10000)["peak_mem_pass"] is True
|
||||
|
||||
|
||||
class TestTestPrompts:
    """Sanity checks on the static TEST_PROMPTS table."""

    def test_has_10_prompts(self):
        assert len(TEST_PROMPTS) == 10

    def test_all_have_patterns(self):
        for entry in TEST_PROMPTS:
            assert "pass_pattern" in entry
            re.compile(entry["pass_pattern"])  # must be a valid regex

    def test_all_have_categories(self):
        distinct = {entry["category"] for entry in TEST_PROMPTS}
        assert len(distinct) >= 4  # At least 4 different categories
|
||||
|
||||
|
||||
class TestReportMarkdown:
    """Tests for report_to_markdown() rendering and go/no-go logic."""

    def test_has_summary(self):
        passing_row = {
            "prompt_id": 1,
            "name": "Test",
            "category": "factual",
            "quality": {"matched": True},
            "performance": {"tok_per_sec": 50},
            "pass": True,
        }
        report = {
            "generated_at": "2026-04-14T00:00:00",
            "model": "test-model",
            "backend": "ollama",
            "kv_type": "fp16",
            "total_prompts": 10,
            "passed": 9,
            "failed": 1,
            "pass_rate": 0.9,
            "quality_pass_rate": 0.95,
            "results": [passing_row],
        }
        md = report_to_markdown(report)
        assert "Test Matrix Report" in md
        assert "9" in md  # passed count appears
        assert "GO" in md  # 90% pass rate meets the threshold

    def test_nogo_on_low_pass_rate(self):
        report = dict(
            generated_at="2026-04-14", model="x", backend="x", kv_type="x",
            total_prompts=10, passed=5, failed=5, pass_rate=0.5,
            quality_pass_rate=0.5, results=[],
        )
        md = report_to_markdown(report)
        assert "NO-GO" in md
|
||||
Reference in New Issue
Block a user