124 lines
4.0 KiB
Python
124 lines
4.0 KiB
Python
|
|
"""Tests for TurboQuant test matrix (Issue #11)."""
|
||
|
|
|
||
|
|
import json
import re
from unittest.mock import patch, MagicMock

import pytest
import sys
from pathlib import Path

# Prepend the "benchmarks" directory that sits next to this file's parent
# directory to the import search path. The import below is resolved by bare
# module name, so it presumably relies on this entry to find ``test_matrix``
# (verify against the repository layout).
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))

# Functions under test plus the constants the benchmark module exposes.
from test_matrix import (
    evaluate_quality,
    evaluate_performance,
    report_to_markdown,
    TEST_PROMPTS,
    PPL_DELTA_MAX,
    TOKS_BASELINE_RATIO,
    TTFT_BASELINE_RATIO,
)
|
||
|
|
|
||
|
|
|
||
|
|
class TestEvaluateQuality:
    """Checks on the dict returned by ``evaluate_quality(response, pattern)``."""

    def test_pattern_match(self):
        """A response containing one of the pattern's alternatives is matched."""
        response = "The first law of thermodynamics states..."
        outcome = evaluate_quality(response, r"(?i)(first law|energy)")
        assert outcome["matched"] is True

    def test_pattern_no_match(self):
        """A response with none of the pattern's alternatives is not matched."""
        response = "Hello world"
        outcome = evaluate_quality(response, r"(?i)(thermodynamics|entropy)")
        assert outcome["matched"] is False

    def test_substance_check(self):
        """A five-character response is reported as lacking substance."""
        outcome = evaluate_quality("Short", r".*")
        assert outcome["has_substance"] is False

    def test_substance_pass(self):
        """A 100-character response is reported as having substance."""
        long_response = "A" * 100
        outcome = evaluate_quality(long_response, r".*")
        assert outcome["has_substance"] is True

    def test_response_length(self):
        """``response_length`` equals the character count of the response."""
        outcome = evaluate_quality("Hello world", r".*")
        assert outcome["response_length"] == 11
|
||
|
|
|
||
|
|
|
||
|
|
class TestEvaluatePerformance:
    """Checks on the pass/fail flags returned by ``evaluate_performance``."""

    @staticmethod
    def _evaluate(tok_per_sec=100, ttft=0.5, peak_mem_mb=1000):
        """Run ``evaluate_performance`` against the fixed 100 tok/s, 0.5 s baseline.

        Fresh dicts are built on every call so no state is shared between
        tests. The leading underscore keeps pytest from collecting this
        helper as a test.
        """
        measured = {
            "tok_per_sec": tok_per_sec,
            "ttft": ttft,
            "peak_mem_mb": peak_mem_mb,
        }
        reference = {"tok_per_sec": 100, "ttft": 0.5}
        return evaluate_performance(measured, reference)

    def test_tok_per_sec_pass(self):
        """Throughput equal to the baseline passes."""
        verdict = self._evaluate()
        assert verdict["tok_per_sec_pass"] is True

    def test_tok_per_sec_fail(self):
        """Throughput at half the baseline fails."""
        verdict = self._evaluate(tok_per_sec=50)
        assert verdict["tok_per_sec_pass"] is False

    def test_ttft_pass(self):
        """Time-to-first-token equal to the baseline passes."""
        verdict = self._evaluate()
        assert verdict["ttft_pass"] is True

    def test_ttft_fail(self):
        """Time-to-first-token at twice the baseline fails."""
        verdict = self._evaluate(ttft=1.0)
        assert verdict["ttft_pass"] is False

    def test_memory_pass(self):
        """A peak memory reading of 10000 MB passes."""
        verdict = self._evaluate(peak_mem_mb=10000)
        assert verdict["peak_mem_pass"] is True
|
||
|
|
|
||
|
|
|
||
|
|
class TestTestPrompts:
    """Sanity checks on the ``TEST_PROMPTS`` collection."""

    def test_has_10_prompts(self):
        """The matrix defines exactly ten prompts."""
        prompt_count = len(TEST_PROMPTS)
        assert prompt_count == 10

    def test_all_have_patterns(self):
        """Every prompt carries a ``pass_pattern`` that is a valid regex."""
        for prompt in TEST_PROMPTS:
            assert "pass_pattern" in prompt
            # re.compile raises re.error on an invalid pattern, failing the test.
            pattern_source = prompt["pass_pattern"]
            re.compile(pattern_source)

    def test_all_have_categories(self):
        """The prompts span at least four distinct categories."""
        distinct_categories = set()
        for prompt in TEST_PROMPTS:
            # A prompt without a "category" key raises KeyError here.
            distinct_categories.add(prompt["category"])
        assert len(distinct_categories) >= 4  # At least 4 different categories
|
||
|
|
|
||
|
|
|
||
|
|
class TestReportMarkdown:
    """Checks on the markdown text produced by ``report_to_markdown``."""

    def test_has_summary(self):
        """A 90% pass-rate report renders a title, the pass count and a GO verdict."""
        report = {
            "generated_at": "2026-04-14T00:00:00",
            "model": "test-model",
            "backend": "ollama",
            "kv_type": "fp16",
            "total_prompts": 10,
            "passed": 9,
            "failed": 1,
            "pass_rate": 0.9,
            "quality_pass_rate": 0.95,
            "results": [
                {
                    "prompt_id": 1,
                    "name": "Test",
                    "category": "factual",
                    "quality": {"matched": True},
                    "performance": {"tok_per_sec": 50},
                    "pass": True,
                },
            ],
        }
        md = report_to_markdown(report)
        assert "Test Matrix Report" in md
        # NOTE(review): this is a loose check; "9" also occurs in the pass-rate
        # values, so it does not pin the rendering of the passed count.
        assert "9" in md  # passed
        # A plain '"GO" in md' is also true for the substring inside "NO-GO",
        # so it could not distinguish the two verdicts. Require a standalone
        # "GO" that is not the tail of "NO-GO".
        assert re.search(r"(?<!NO-)\bGO\b", md)  # 90% pass rate

    def test_nogo_on_low_pass_rate(self):
        """A 50% pass-rate report renders a NO-GO verdict."""
        report = {
            "generated_at": "2026-04-14",
            "model": "x",
            "backend": "x",
            "kv_type": "x",
            "total_prompts": 10,
            "passed": 5,
            "failed": 5,
            "pass_rate": 0.5,
            "quality_pass_rate": 0.5,
            "results": [],
        }
        md = report_to_markdown(report)
        assert "NO-GO" in md
|