""" Tests for the Uniwizard Quality Scorer module. Run with: python -m pytest ~/.timmy/uniwizard/test_quality_scorer.py -v """ import sqlite3 import tempfile from pathlib import Path import pytest from quality_scorer import ( QualityScorer, ResponseStatus, TaskType, BACKENDS, BackendScore, print_score_report, print_full_report, get_scorer, record, recommend, ) class TestQualityScorer: """Tests for the QualityScorer class.""" @pytest.fixture def temp_db(self): """Create a temporary database for testing.""" with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: db_path = Path(f.name) yield db_path db_path.unlink(missing_ok=True) @pytest.fixture def scorer(self, temp_db): """Create a fresh QualityScorer with temp database.""" return QualityScorer(db_path=temp_db) def test_init_creates_database(self, temp_db): """Test that initialization creates the database and tables.""" scorer = QualityScorer(db_path=temp_db) assert temp_db.exists() # Verify schema conn = sqlite3.connect(str(temp_db)) cursor = conn.execute( "SELECT name FROM sqlite_master WHERE type='table'" ) tables = {row[0] for row in cursor.fetchall()} assert "responses" in tables conn.close() def test_record_response_success(self, scorer): """Test recording a successful response.""" scorer.record_response( backend="anthropic", task_type=TaskType.CODE, status=ResponseStatus.SUCCESS, latency_ms=1000.0, ttft_ms=150.0, metadata={"model": "claude-3-opus"} ) score = scorer.get_backend_score("anthropic", TaskType.CODE.value) assert score.total_requests == 1 assert score.success_count == 1 assert score.error_count == 0 def test_record_response_error(self, scorer): """Test recording an error response.""" scorer.record_response( backend="groq", task_type=TaskType.FAST_OPS, status=ResponseStatus.ERROR, latency_ms=500.0, ttft_ms=50.0 ) score = scorer.get_backend_score("groq", TaskType.FAST_OPS.value) assert score.total_requests == 1 assert score.success_count == 0 assert score.error_count == 1 def test_record_response_refusal(self, scorer): """Test recording a refusal response.""" scorer.record_response( backend="gemini", task_type=TaskType.CREATIVE, status=ResponseStatus.REFUSAL, latency_ms=300.0, ttft_ms=100.0 ) score = scorer.get_backend_score("gemini", TaskType.CREATIVE.value) assert score.refusal_count == 1 def test_record_response_timeout(self, scorer): """Test recording a timeout response.""" scorer.record_response( backend="openrouter", task_type=TaskType.RESEARCH, status=ResponseStatus.TIMEOUT, latency_ms=30000.0, ttft_ms=0.0 ) score = scorer.get_backend_score("openrouter", TaskType.RESEARCH.value) assert score.timeout_count == 1 def test_record_invalid_backend(self, scorer): """Test that invalid backend raises ValueError.""" with pytest.raises(ValueError, match="Unknown backend"): scorer.record_response( backend="invalid-backend", task_type=TaskType.CODE, status=ResponseStatus.SUCCESS, latency_ms=1000.0, ttft_ms=100.0 ) def test_rolling_window_pruning(self, scorer): """Test that old records are pruned beyond window size.""" # Add more than ROLLING_WINDOW_SIZE records for i in range(110): scorer.record_response( backend="kimi-coding", task_type=TaskType.CODE, status=ResponseStatus.SUCCESS, latency_ms=float(i), ttft_ms=50.0 ) # Should only have 100 records stats = scorer.get_stats() assert stats["by_backend"]["kimi-coding"] == 100 def test_recommend_backend_basic(self, scorer): """Test backend recommendation with sample data.""" # Add some data for multiple backends for backend in ["anthropic", "groq", "gemini"]: for i in range(10): scorer.record_response( backend=backend, task_type=TaskType.CODE, status=ResponseStatus.SUCCESS if i < 8 else ResponseStatus.ERROR, latency_ms=1000.0 if backend == "anthropic" else 500.0, ttft_ms=200.0 ) recommendations = scorer.recommend_backend(TaskType.CODE.value) # Should return all 7 backends assert len(recommendations) == 7 # Top 3 should have scores top_3 = [b for b, s in recommendations[:3]] assert "groq" in top_3 # Fastest latency should win def test_recommend_backend_insufficient_data(self, scorer): """Test recommendation with insufficient samples.""" # Add only 2 samples for one backend for i in range(2): scorer.record_response( backend="anthropic", task_type=TaskType.CODE, status=ResponseStatus.SUCCESS, latency_ms=1000.0, ttft_ms=200.0 ) recommendations = scorer.recommend_backend(TaskType.CODE.value, min_samples=5) # Should penalize low-sample backend anthropic_score = next(s for b, s in recommendations if b == "anthropic") assert anthropic_score < 50 # Penalized for low samples def test_get_all_scores(self, scorer): """Test getting scores for all backends.""" # Add data for some backends for backend in ["anthropic", "groq"]: scorer.record_response( backend=backend, task_type=TaskType.REASONING, status=ResponseStatus.SUCCESS, latency_ms=1000.0, ttft_ms=200.0 ) all_scores = scorer.get_all_scores(TaskType.REASONING.value) assert len(all_scores) == 7 assert all_scores["anthropic"].total_requests == 1 assert all_scores["groq"].total_requests == 1 assert all_scores["gemini"].total_requests == 0 def test_get_task_breakdown(self, scorer): """Test getting per-task breakdown for a backend.""" # Add data for different task types scorer.record_response( backend="anthropic", task_type=TaskType.CODE, status=ResponseStatus.SUCCESS, latency_ms=1000.0, ttft_ms=200.0 ) scorer.record_response( backend="anthropic", task_type=TaskType.REASONING, status=ResponseStatus.SUCCESS, latency_ms=2000.0, ttft_ms=300.0 ) breakdown = scorer.get_task_breakdown("anthropic") assert len(breakdown) == 5 # 5 task types assert breakdown["code"].total_requests == 1 assert breakdown["reasoning"].total_requests == 1 def test_score_calculation(self, scorer): """Test the composite score calculation.""" # Add perfect responses for i in range(10): scorer.record_response( backend="anthropic", task_type=TaskType.CODE, status=ResponseStatus.SUCCESS, latency_ms=100.0, # Very fast ttft_ms=50.0 ) score = scorer.get_backend_score("anthropic", TaskType.CODE.value) # Should have high score for perfect performance assert score.score > 90 assert score.success_count == 10 assert score.avg_latency_ms == 100.0 def test_score_with_errors(self, scorer): """Test scoring with mixed success/error.""" for i in range(5): scorer.record_response( backend="grok", task_type=TaskType.RESEARCH, status=ResponseStatus.SUCCESS, latency_ms=1000.0, ttft_ms=200.0 ) for i in range(5): scorer.record_response( backend="grok", task_type=TaskType.RESEARCH, status=ResponseStatus.ERROR, latency_ms=500.0, ttft_ms=100.0 ) score = scorer.get_backend_score("grok", TaskType.RESEARCH.value) assert score.total_requests == 10 assert score.success_count == 5 assert score.error_count == 5 # Score: 50% success + low error penalty = ~71 with good latency assert 60 < score.score < 80 def test_p95_calculation(self, scorer): """Test P95 latency calculation.""" # Add latencies from 1ms to 100ms for i in range(1, 101): scorer.record_response( backend="anthropic", task_type=TaskType.CODE, status=ResponseStatus.SUCCESS, latency_ms=float(i), ttft_ms=50.0 ) score = scorer.get_backend_score("anthropic", TaskType.CODE.value) # P95 should be around 95 assert 90 <= score.p95_latency_ms <= 100 def test_clear_data(self, scorer): """Test clearing all data.""" scorer.record_response( backend="anthropic", task_type=TaskType.CODE, status=ResponseStatus.SUCCESS, latency_ms=1000.0, ttft_ms=200.0 ) scorer.clear_data() stats = scorer.get_stats() assert stats["total_records"] == 0 def test_string_task_type(self, scorer): """Test that string task types work alongside TaskType enum.""" scorer.record_response( backend="openai-codex", task_type="code", # String instead of enum status=ResponseStatus.SUCCESS, latency_ms=1000.0, ttft_ms=200.0 ) score = scorer.get_backend_score("openai-codex", "code") assert score.total_requests == 1 class TestConvenienceFunctions: """Tests for module-level convenience functions.""" @pytest.fixture def temp_db(self): """Create a temporary database for testing.""" with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: db_path = Path(f.name) # Patch the default path import quality_scorer original_path = quality_scorer.DEFAULT_DB_PATH quality_scorer.DEFAULT_DB_PATH = db_path yield db_path quality_scorer.DEFAULT_DB_PATH = original_path db_path.unlink(missing_ok=True) def test_get_scorer(self, temp_db): """Test get_scorer convenience function.""" scorer = get_scorer() assert isinstance(scorer, QualityScorer) def test_record_convenience(self, temp_db): """Test record convenience function.""" record( backend="anthropic", task_type="code", status="success", latency_ms=1000.0, ttft_ms=200.0 ) scorer = get_scorer() score = scorer.get_backend_score("anthropic", "code") assert score.total_requests == 1 def test_recommend_convenience(self, temp_db): """Test recommend convenience function.""" record( backend="anthropic", task_type="code", status="success", latency_ms=1000.0, ttft_ms=200.0 ) recs = recommend("code") assert len(recs) == 7 assert recs[0][0] == "anthropic" # Should rank first since it has data class TestPrintFunctions: """Tests for print/report functions (smoke tests).""" @pytest.fixture def populated_scorer(self): """Create a scorer with demo data.""" with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: db_path = Path(f.name) scorer = QualityScorer(db_path=db_path) # Add demo data for all backends import random random.seed(42) for backend in BACKENDS: for task in TaskType: for i in range(20): scorer.record_response( backend=backend, task_type=task.value, status=random.choices( [ResponseStatus.SUCCESS, ResponseStatus.ERROR, ResponseStatus.REFUSAL, ResponseStatus.TIMEOUT], weights=[0.85, 0.08, 0.05, 0.02] )[0], latency_ms=random.gauss( 1000 if backend in ["anthropic", "openai-codex"] else 500, 200 ), ttft_ms=random.gauss(150, 50) ) yield scorer db_path.unlink(missing_ok=True) def test_print_score_report(self, populated_scorer, capsys): """Test print_score_report doesn't crash.""" print_score_report(populated_scorer) captured = capsys.readouterr() assert "UNIWIZARD BACKEND QUALITY SCORES" in captured.out assert "anthropic" in captured.out def test_print_full_report(self, populated_scorer, capsys): """Test print_full_report doesn't crash.""" print_full_report(populated_scorer) captured = capsys.readouterr() assert "PER-TASK SPECIALIZATION" in captured.out assert "RECOMMENDATIONS" in captured.out class TestEdgeCases: """Tests for edge cases and error handling.""" @pytest.fixture def temp_db(self): """Create a temporary database for testing.""" with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: db_path = Path(f.name) yield db_path db_path.unlink(missing_ok=True) @pytest.fixture def scorer(self, temp_db): """Create a fresh QualityScorer with temp database.""" return QualityScorer(db_path=temp_db) def test_empty_database(self, scorer): """Test behavior with empty database.""" score = scorer.get_backend_score("anthropic", TaskType.CODE.value) assert score.total_requests == 0 assert score.score == 0.0 assert score.avg_latency_ms == 0.0 def test_invalid_backend_in_get_score(self, scorer): """Test that invalid backend raises error in get_score.""" with pytest.raises(ValueError, match="Unknown backend"): scorer.get_backend_score("invalid") def test_invalid_backend_in_breakdown(self, scorer): """Test that invalid backend raises error in get_task_breakdown.""" with pytest.raises(ValueError, match="Unknown backend"): scorer.get_task_breakdown("invalid") def test_zero_latency(self, scorer): """Test handling of zero latency.""" scorer.record_response( backend="groq", task_type=TaskType.FAST_OPS, status=ResponseStatus.SUCCESS, latency_ms=0.0, ttft_ms=0.0 ) score = scorer.get_backend_score("groq", TaskType.FAST_OPS.value) assert score.avg_latency_ms == 0.0 assert score.score > 50 # Should still have decent score def test_very_high_latency(self, scorer): """Test handling of very high latency.""" scorer.record_response( backend="openrouter", task_type=TaskType.RESEARCH, status=ResponseStatus.SUCCESS, latency_ms=50000.0, # 50 seconds ttft_ms=5000.0 ) score = scorer.get_backend_score("openrouter", TaskType.RESEARCH.value) # Success rate is 100% but latency penalty brings it down assert score.score < 85 # Should be penalized for high latency def test_all_error_responses(self, scorer): """Test scoring when all responses are errors.""" for i in range(10): scorer.record_response( backend="gemini", task_type=TaskType.CODE, status=ResponseStatus.ERROR, latency_ms=1000.0, ttft_ms=200.0 ) score = scorer.get_backend_score("gemini", TaskType.CODE.value) # 0% success but perfect error/refusal/timeout rate = ~35 assert score.score < 45 # Should have low score def test_all_refusal_responses(self, scorer): """Test scoring when all responses are refusals.""" for i in range(10): scorer.record_response( backend="gemini", task_type=TaskType.CREATIVE, status=ResponseStatus.REFUSAL, latency_ms=500.0, ttft_ms=100.0 ) score = scorer.get_backend_score("gemini", TaskType.CREATIVE.value) assert score.refusal_count == 10 # 0% success, 0% error, 100% refusal, good latency = ~49 assert score.score < 55 # Should be low due to refusals def test_metadata_storage(self, scorer): """Test that metadata is stored correctly.""" scorer.record_response( backend="anthropic", task_type=TaskType.CODE, status=ResponseStatus.SUCCESS, latency_ms=1000.0, ttft_ms=200.0, metadata={"model": "claude-3-opus", "region": "us-east-1"} ) # Verify in database conn = sqlite3.connect(str(scorer.db_path)) row = conn.execute("SELECT metadata FROM responses LIMIT 1").fetchone() conn.close() import json metadata = json.loads(row[0]) assert metadata["model"] == "claude-3-opus" if __name__ == "__main__": pytest.main([__file__, "-v"])