# Source: timmy-home/uniwizard/test_quality_scorer.py
"""
Tests for the Uniwizard Quality Scorer module.
Run with: python -m pytest ~/.timmy/uniwizard/test_quality_scorer.py -v
"""
import sqlite3
import tempfile
from pathlib import Path
import pytest
from quality_scorer import (
QualityScorer,
ResponseStatus,
TaskType,
BACKENDS,
BackendScore,
print_score_report,
print_full_report,
get_scorer,
record,
recommend,
)
class TestQualityScorer:
    """Tests for the QualityScorer class."""

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database for testing."""
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)
        yield db_path
        db_path.unlink(missing_ok=True)

    @pytest.fixture
    def scorer(self, temp_db):
        """Create a fresh QualityScorer with temp database."""
        return QualityScorer(db_path=temp_db)

    def test_init_creates_database(self, temp_db):
        """Test that initialization creates the database and tables."""
        QualityScorer(db_path=temp_db)  # constructing the scorer must create the file
        assert temp_db.exists()
        # Verify schema; close the connection even if an assertion fails.
        conn = sqlite3.connect(str(temp_db))
        try:
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            )
            tables = {row[0] for row in cursor.fetchall()}
        finally:
            conn.close()
        assert "responses" in tables

    def test_record_response_success(self, scorer):
        """Test recording a successful response."""
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=150.0,
            metadata={"model": "claude-3-opus"}
        )
        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)
        assert score.total_requests == 1
        assert score.success_count == 1
        assert score.error_count == 0

    def test_record_response_error(self, scorer):
        """Test recording an error response."""
        scorer.record_response(
            backend="groq",
            task_type=TaskType.FAST_OPS,
            status=ResponseStatus.ERROR,
            latency_ms=500.0,
            ttft_ms=50.0
        )
        score = scorer.get_backend_score("groq", TaskType.FAST_OPS.value)
        assert score.total_requests == 1
        assert score.success_count == 0
        assert score.error_count == 1

    def test_record_response_refusal(self, scorer):
        """Test recording a refusal response."""
        scorer.record_response(
            backend="gemini",
            task_type=TaskType.CREATIVE,
            status=ResponseStatus.REFUSAL,
            latency_ms=300.0,
            ttft_ms=100.0
        )
        score = scorer.get_backend_score("gemini", TaskType.CREATIVE.value)
        assert score.refusal_count == 1

    def test_record_response_timeout(self, scorer):
        """Test recording a timeout response."""
        scorer.record_response(
            backend="openrouter",
            task_type=TaskType.RESEARCH,
            status=ResponseStatus.TIMEOUT,
            latency_ms=30000.0,
            ttft_ms=0.0
        )
        score = scorer.get_backend_score("openrouter", TaskType.RESEARCH.value)
        assert score.timeout_count == 1

    def test_record_invalid_backend(self, scorer):
        """Test that invalid backend raises ValueError."""
        with pytest.raises(ValueError, match="Unknown backend"):
            scorer.record_response(
                backend="invalid-backend",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=100.0
            )

    def test_rolling_window_pruning(self, scorer):
        """Test that old records are pruned beyond window size."""
        # Add more than ROLLING_WINDOW_SIZE records
        for i in range(110):
            scorer.record_response(
                backend="kimi-coding",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=float(i),
                ttft_ms=50.0
            )
        # Should only have 100 records
        stats = scorer.get_stats()
        assert stats["by_backend"]["kimi-coding"] == 100

    def test_recommend_backend_basic(self, scorer):
        """Test backend recommendation with sample data."""
        # Add some data for multiple backends
        for backend in ["anthropic", "groq", "gemini"]:
            for i in range(10):
                scorer.record_response(
                    backend=backend,
                    task_type=TaskType.CODE,
                    status=ResponseStatus.SUCCESS if i < 8 else ResponseStatus.ERROR,
                    latency_ms=1000.0 if backend == "anthropic" else 500.0,
                    ttft_ms=200.0
                )
        recommendations = scorer.recommend_backend(TaskType.CODE.value)
        # Should return all 7 backends
        assert len(recommendations) == 7
        # Top 3 should have scores
        top_3 = [b for b, _ in recommendations[:3]]
        assert "groq" in top_3  # Fastest latency should win

    def test_recommend_backend_insufficient_data(self, scorer):
        """Test recommendation with insufficient samples."""
        # Add only 2 samples for one backend
        for _ in range(2):
            scorer.record_response(
                backend="anthropic",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=200.0
            )
        recommendations = scorer.recommend_backend(TaskType.CODE.value, min_samples=5)
        # Should penalize low-sample backend
        anthropic_score = next(s for b, s in recommendations if b == "anthropic")
        assert anthropic_score < 50  # Penalized for low samples

    def test_get_all_scores(self, scorer):
        """Test getting scores for all backends."""
        # Add data for some backends
        for backend in ["anthropic", "groq"]:
            scorer.record_response(
                backend=backend,
                task_type=TaskType.REASONING,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=200.0
            )
        all_scores = scorer.get_all_scores(TaskType.REASONING.value)
        assert len(all_scores) == 7
        assert all_scores["anthropic"].total_requests == 1
        assert all_scores["groq"].total_requests == 1
        assert all_scores["gemini"].total_requests == 0

    def test_get_task_breakdown(self, scorer):
        """Test getting per-task breakdown for a backend."""
        # Add data for different task types
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0
        )
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.REASONING,
            status=ResponseStatus.SUCCESS,
            latency_ms=2000.0,
            ttft_ms=300.0
        )
        breakdown = scorer.get_task_breakdown("anthropic")
        assert len(breakdown) == 5  # 5 task types
        assert breakdown["code"].total_requests == 1
        assert breakdown["reasoning"].total_requests == 1

    def test_score_calculation(self, scorer):
        """Test the composite score calculation."""
        # Add perfect responses
        for _ in range(10):
            scorer.record_response(
                backend="anthropic",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=100.0,  # Very fast
                ttft_ms=50.0
            )
        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)
        # Should have high score for perfect performance
        assert score.score > 90
        assert score.success_count == 10
        assert score.avg_latency_ms == 100.0

    def test_score_with_errors(self, scorer):
        """Test scoring with mixed success/error."""
        for _ in range(5):
            scorer.record_response(
                backend="grok",
                task_type=TaskType.RESEARCH,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=200.0
            )
        for _ in range(5):
            scorer.record_response(
                backend="grok",
                task_type=TaskType.RESEARCH,
                status=ResponseStatus.ERROR,
                latency_ms=500.0,
                ttft_ms=100.0
            )
        score = scorer.get_backend_score("grok", TaskType.RESEARCH.value)
        assert score.total_requests == 10
        assert score.success_count == 5
        assert score.error_count == 5
        # Score: 50% success + low error penalty = ~71 with good latency
        assert 60 < score.score < 80

    def test_p95_calculation(self, scorer):
        """Test P95 latency calculation."""
        # Add latencies from 1ms to 100ms
        for i in range(1, 101):
            scorer.record_response(
                backend="anthropic",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=float(i),
                ttft_ms=50.0
            )
        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)
        # P95 should be around 95
        assert 90 <= score.p95_latency_ms <= 100

    def test_clear_data(self, scorer):
        """Test clearing all data."""
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0
        )
        scorer.clear_data()
        stats = scorer.get_stats()
        assert stats["total_records"] == 0

    def test_string_task_type(self, scorer):
        """Test that string task types work alongside TaskType enum."""
        scorer.record_response(
            backend="openai-codex",
            task_type="code",  # String instead of enum
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0
        )
        score = scorer.get_backend_score("openai-codex", "code")
        assert score.total_requests == 1
class TestConvenienceFunctions:
    """Tests for module-level convenience functions."""

    @pytest.fixture
    def temp_db(self, monkeypatch):
        """Create a temporary database and point DEFAULT_DB_PATH at it.

        Uses pytest's monkeypatch fixture so the original module-level
        path is restored automatically, even if teardown is interrupted.
        """
        import quality_scorer

        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)
        monkeypatch.setattr(quality_scorer, "DEFAULT_DB_PATH", db_path)
        yield db_path
        db_path.unlink(missing_ok=True)

    def test_get_scorer(self, temp_db):
        """Test get_scorer convenience function."""
        scorer = get_scorer()
        assert isinstance(scorer, QualityScorer)

    def test_record_convenience(self, temp_db):
        """Test record convenience function."""
        record(
            backend="anthropic",
            task_type="code",
            status="success",
            latency_ms=1000.0,
            ttft_ms=200.0
        )
        scorer = get_scorer()
        score = scorer.get_backend_score("anthropic", "code")
        assert score.total_requests == 1

    def test_recommend_convenience(self, temp_db):
        """Test recommend convenience function."""
        record(
            backend="anthropic",
            task_type="code",
            status="success",
            latency_ms=1000.0,
            ttft_ms=200.0
        )
        recs = recommend("code")
        assert len(recs) == 7
        assert recs[0][0] == "anthropic"  # Should rank first since it has data
class TestPrintFunctions:
    """Tests for print/report functions (smoke tests)."""

    @pytest.fixture
    def populated_scorer(self):
        """Yield a scorer filled with deterministic demo data for every backend/task."""
        import random

        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)
        demo_scorer = QualityScorer(db_path=db_path)

        statuses = [ResponseStatus.SUCCESS, ResponseStatus.ERROR,
                    ResponseStatus.REFUSAL, ResponseStatus.TIMEOUT]
        status_weights = [0.85, 0.08, 0.05, 0.02]

        random.seed(42)  # fixed seed keeps the report contents reproducible
        for name in BACKENDS:
            # The two "premium" backends get a slower latency profile.
            base_latency = 1000 if name in ["anthropic", "openai-codex"] else 500
            for task in TaskType:
                for _ in range(20):
                    demo_scorer.record_response(
                        backend=name,
                        task_type=task.value,
                        status=random.choices(statuses, weights=status_weights)[0],
                        latency_ms=random.gauss(base_latency, 200),
                        ttft_ms=random.gauss(150, 50),
                    )
        yield demo_scorer
        db_path.unlink(missing_ok=True)

    def test_print_score_report(self, populated_scorer, capsys):
        """Smoke test: print_score_report runs and emits the expected headings."""
        print_score_report(populated_scorer)
        out = capsys.readouterr().out
        assert "UNIWIZARD BACKEND QUALITY SCORES" in out
        assert "anthropic" in out

    def test_print_full_report(self, populated_scorer, capsys):
        """Smoke test: print_full_report runs and emits the expected sections."""
        print_full_report(populated_scorer)
        out = capsys.readouterr().out
        assert "PER-TASK SPECIALIZATION" in out
        assert "RECOMMENDATIONS" in out
class TestEdgeCases:
    """Tests for edge cases and error handling."""

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database for testing."""
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)
        yield db_path
        db_path.unlink(missing_ok=True)

    @pytest.fixture
    def scorer(self, temp_db):
        """Create a fresh QualityScorer with temp database."""
        return QualityScorer(db_path=temp_db)

    def test_empty_database(self, scorer):
        """Test behavior with empty database."""
        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)
        assert score.total_requests == 0
        assert score.score == 0.0
        assert score.avg_latency_ms == 0.0

    def test_invalid_backend_in_get_score(self, scorer):
        """Test that invalid backend raises error in get_score."""
        with pytest.raises(ValueError, match="Unknown backend"):
            scorer.get_backend_score("invalid")

    def test_invalid_backend_in_breakdown(self, scorer):
        """Test that invalid backend raises error in get_task_breakdown."""
        with pytest.raises(ValueError, match="Unknown backend"):
            scorer.get_task_breakdown("invalid")

    def test_zero_latency(self, scorer):
        """Test handling of zero latency."""
        scorer.record_response(
            backend="groq",
            task_type=TaskType.FAST_OPS,
            status=ResponseStatus.SUCCESS,
            latency_ms=0.0,
            ttft_ms=0.0
        )
        score = scorer.get_backend_score("groq", TaskType.FAST_OPS.value)
        assert score.avg_latency_ms == 0.0
        assert score.score > 50  # Should still have decent score

    def test_very_high_latency(self, scorer):
        """Test handling of very high latency."""
        scorer.record_response(
            backend="openrouter",
            task_type=TaskType.RESEARCH,
            status=ResponseStatus.SUCCESS,
            latency_ms=50000.0,  # 50 seconds
            ttft_ms=5000.0
        )
        score = scorer.get_backend_score("openrouter", TaskType.RESEARCH.value)
        # Success rate is 100% but latency penalty brings it down
        assert score.score < 85  # Should be penalized for high latency

    def test_all_error_responses(self, scorer):
        """Test scoring when all responses are errors."""
        for _ in range(10):
            scorer.record_response(
                backend="gemini",
                task_type=TaskType.CODE,
                status=ResponseStatus.ERROR,
                latency_ms=1000.0,
                ttft_ms=200.0
            )
        score = scorer.get_backend_score("gemini", TaskType.CODE.value)
        # 0% success but perfect error/refusal/timeout rate = ~35
        assert score.score < 45  # Should have low score

    def test_all_refusal_responses(self, scorer):
        """Test scoring when all responses are refusals."""
        for _ in range(10):
            scorer.record_response(
                backend="gemini",
                task_type=TaskType.CREATIVE,
                status=ResponseStatus.REFUSAL,
                latency_ms=500.0,
                ttft_ms=100.0
            )
        score = scorer.get_backend_score("gemini", TaskType.CREATIVE.value)
        assert score.refusal_count == 10
        # 0% success, 0% error, 100% refusal, good latency = ~49
        assert score.score < 55  # Should be low due to refusals

    def test_metadata_storage(self, scorer):
        """Test that metadata is stored correctly."""
        import json

        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0,
            metadata={"model": "claude-3-opus", "region": "us-east-1"}
        )
        # Verify in database; close the connection even if an assertion fails.
        conn = sqlite3.connect(str(scorer.db_path))
        try:
            row = conn.execute("SELECT metadata FROM responses LIMIT 1").fetchone()
        finally:
            conn.close()
        metadata = json.loads(row[0])
        assert metadata["model"] == "claude-3-opus"
if __name__ == "__main__":
    # Propagate pytest's exit status so direct invocation reports failures
    # to the shell/CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))