"""
Tests for the Uniwizard Quality Scorer module.

Run with: python -m pytest ~/.timmy/uniwizard/test_quality_scorer.py -v
"""

import sqlite3
import tempfile
from pathlib import Path

import pytest

from quality_scorer import (
    QualityScorer,
    ResponseStatus,
    TaskType,
    BACKENDS,
    BackendScore,
    print_score_report,
    print_full_report,
    get_scorer,
    record,
    recommend,
)
|
||
|
|
|
||
|
|
|
||
|
|
class TestQualityScorer:
    """Tests for the QualityScorer class."""

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database for testing.

        Yields the path to a fresh .db file and removes it afterwards.
        delete=False is required so the scorer can reopen the file by path.
        """
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)
        # NOTE(review): original indentation was lost; yield/cleanup placed
        # outside the `with` — behavior is identical since delete=False.
        yield db_path
        db_path.unlink(missing_ok=True)

    @pytest.fixture
    def scorer(self, temp_db):
        """Create a fresh QualityScorer with temp database."""
        return QualityScorer(db_path=temp_db)

    def test_init_creates_database(self, temp_db):
        """Test that initialization creates the database and tables."""
        scorer = QualityScorer(db_path=temp_db)
        assert temp_db.exists()

        # Verify schema directly via sqlite3 rather than through the scorer.
        conn = sqlite3.connect(str(temp_db))
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"
        )
        tables = {row[0] for row in cursor.fetchall()}
        assert "responses" in tables
        conn.close()

    def test_record_response_success(self, scorer):
        """Test recording a successful response."""
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=150.0,
            metadata={"model": "claude-3-opus"}
        )

        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)
        assert score.total_requests == 1
        assert score.success_count == 1
        assert score.error_count == 0

    def test_record_response_error(self, scorer):
        """Test recording an error response."""
        scorer.record_response(
            backend="groq",
            task_type=TaskType.FAST_OPS,
            status=ResponseStatus.ERROR,
            latency_ms=500.0,
            ttft_ms=50.0
        )

        score = scorer.get_backend_score("groq", TaskType.FAST_OPS.value)
        assert score.total_requests == 1
        assert score.success_count == 0
        assert score.error_count == 1

    def test_record_response_refusal(self, scorer):
        """Test recording a refusal response."""
        scorer.record_response(
            backend="gemini",
            task_type=TaskType.CREATIVE,
            status=ResponseStatus.REFUSAL,
            latency_ms=300.0,
            ttft_ms=100.0
        )

        score = scorer.get_backend_score("gemini", TaskType.CREATIVE.value)
        assert score.refusal_count == 1

    def test_record_response_timeout(self, scorer):
        """Test recording a timeout response."""
        scorer.record_response(
            backend="openrouter",
            task_type=TaskType.RESEARCH,
            status=ResponseStatus.TIMEOUT,
            latency_ms=30000.0,
            ttft_ms=0.0
        )

        score = scorer.get_backend_score("openrouter", TaskType.RESEARCH.value)
        assert score.timeout_count == 1

    def test_record_invalid_backend(self, scorer):
        """Test that invalid backend raises ValueError."""
        with pytest.raises(ValueError, match="Unknown backend"):
            scorer.record_response(
                backend="invalid-backend",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=100.0
            )

    def test_rolling_window_pruning(self, scorer):
        """Test that old records are pruned beyond window size."""
        # Add more than ROLLING_WINDOW_SIZE records
        for i in range(110):
            scorer.record_response(
                backend="kimi-coding",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=float(i),
                ttft_ms=50.0
            )

        # Should only have 100 records
        stats = scorer.get_stats()
        assert stats["by_backend"]["kimi-coding"] == 100

    def test_recommend_backend_basic(self, scorer):
        """Test backend recommendation with sample data."""
        # Add some data for multiple backends: 8 successes + 2 errors each;
        # groq/gemini get half the latency of anthropic.
        for backend in ["anthropic", "groq", "gemini"]:
            for i in range(10):
                scorer.record_response(
                    backend=backend,
                    task_type=TaskType.CODE,
                    status=ResponseStatus.SUCCESS if i < 8 else ResponseStatus.ERROR,
                    latency_ms=1000.0 if backend == "anthropic" else 500.0,
                    ttft_ms=200.0
                )

        recommendations = scorer.recommend_backend(TaskType.CODE.value)

        # Should return all 7 backends
        assert len(recommendations) == 7

        # Top 3 should have scores
        top_3 = [b for b, s in recommendations[:3]]
        assert "groq" in top_3  # Fastest latency should win

    def test_recommend_backend_insufficient_data(self, scorer):
        """Test recommendation with insufficient samples."""
        # Add only 2 samples for one backend
        for i in range(2):
            scorer.record_response(
                backend="anthropic",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=200.0
            )

        recommendations = scorer.recommend_backend(TaskType.CODE.value, min_samples=5)

        # Should penalize low-sample backend
        anthropic_score = next(s for b, s in recommendations if b == "anthropic")
        assert anthropic_score < 50  # Penalized for low samples

    def test_get_all_scores(self, scorer):
        """Test getting scores for all backends."""
        # Add data for some backends
        for backend in ["anthropic", "groq"]:
            scorer.record_response(
                backend=backend,
                task_type=TaskType.REASONING,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=200.0
            )

        all_scores = scorer.get_all_scores(TaskType.REASONING.value)

        assert len(all_scores) == 7
        assert all_scores["anthropic"].total_requests == 1
        assert all_scores["groq"].total_requests == 1
        assert all_scores["gemini"].total_requests == 0

    def test_get_task_breakdown(self, scorer):
        """Test getting per-task breakdown for a backend."""
        # Add data for different task types
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0
        )
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.REASONING,
            status=ResponseStatus.SUCCESS,
            latency_ms=2000.0,
            ttft_ms=300.0
        )

        breakdown = scorer.get_task_breakdown("anthropic")

        assert len(breakdown) == 5  # 5 task types
        assert breakdown["code"].total_requests == 1
        assert breakdown["reasoning"].total_requests == 1

    def test_score_calculation(self, scorer):
        """Test the composite score calculation."""
        # Add perfect responses
        for i in range(10):
            scorer.record_response(
                backend="anthropic",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=100.0,  # Very fast
                ttft_ms=50.0
            )

        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)

        # Should have high score for perfect performance
        assert score.score > 90
        assert score.success_count == 10
        assert score.avg_latency_ms == 100.0

    def test_score_with_errors(self, scorer):
        """Test scoring with mixed success/error."""
        for i in range(5):
            scorer.record_response(
                backend="grok",
                task_type=TaskType.RESEARCH,
                status=ResponseStatus.SUCCESS,
                latency_ms=1000.0,
                ttft_ms=200.0
            )
        for i in range(5):
            scorer.record_response(
                backend="grok",
                task_type=TaskType.RESEARCH,
                status=ResponseStatus.ERROR,
                latency_ms=500.0,
                ttft_ms=100.0
            )

        score = scorer.get_backend_score("grok", TaskType.RESEARCH.value)

        assert score.total_requests == 10
        assert score.success_count == 5
        assert score.error_count == 5
        # Score: 50% success + low error penalty = ~71 with good latency
        assert 60 < score.score < 80

    def test_p95_calculation(self, scorer):
        """Test P95 latency calculation."""
        # Add latencies from 1ms to 100ms
        for i in range(1, 101):
            scorer.record_response(
                backend="anthropic",
                task_type=TaskType.CODE,
                status=ResponseStatus.SUCCESS,
                latency_ms=float(i),
                ttft_ms=50.0
            )

        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)

        # P95 should be around 95
        assert 90 <= score.p95_latency_ms <= 100

    def test_clear_data(self, scorer):
        """Test clearing all data."""
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0
        )

        scorer.clear_data()

        stats = scorer.get_stats()
        assert stats["total_records"] == 0

    def test_string_task_type(self, scorer):
        """Test that string task types work alongside TaskType enum."""
        scorer.record_response(
            backend="openai-codex",
            task_type="code",  # String instead of enum
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0
        )

        score = scorer.get_backend_score("openai-codex", "code")
        assert score.total_requests == 1
|
class TestConvenienceFunctions:
    """Tests for module-level convenience functions."""

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database and point the module default at it.

        Swaps quality_scorer.DEFAULT_DB_PATH to the temp file for the
        duration of the test, then restores it and removes the file.
        """
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)

        # Patch the default path so get_scorer()/record()/recommend() use it.
        import quality_scorer
        original_path = quality_scorer.DEFAULT_DB_PATH
        quality_scorer.DEFAULT_DB_PATH = db_path

        yield db_path

        quality_scorer.DEFAULT_DB_PATH = original_path
        db_path.unlink(missing_ok=True)

    def test_get_scorer(self, temp_db):
        """Test get_scorer convenience function."""
        scorer = get_scorer()
        assert isinstance(scorer, QualityScorer)

    def test_record_convenience(self, temp_db):
        """Test record convenience function."""
        record(
            backend="anthropic",
            task_type="code",
            status="success",
            latency_ms=1000.0,
            ttft_ms=200.0
        )

        scorer = get_scorer()
        score = scorer.get_backend_score("anthropic", "code")
        assert score.total_requests == 1

    def test_recommend_convenience(self, temp_db):
        """Test recommend convenience function."""
        record(
            backend="anthropic",
            task_type="code",
            status="success",
            latency_ms=1000.0,
            ttft_ms=200.0
        )

        recs = recommend("code")
        assert len(recs) == 7
        assert recs[0][0] == "anthropic"  # Should rank first since it has data
|
class TestPrintFunctions:
    """Tests for print/report functions (smoke tests)."""

    @pytest.fixture
    def populated_scorer(self):
        """Create a scorer with demo data for every backend and task type.

        Seeds random so the generated status/latency mix is reproducible.
        """
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)

        scorer = QualityScorer(db_path=db_path)

        # Add demo data for all backends
        import random
        random.seed(42)

        for backend in BACKENDS:
            for task in TaskType:
                for i in range(20):
                    scorer.record_response(
                        backend=backend,
                        task_type=task.value,
                        status=random.choices(
                            [ResponseStatus.SUCCESS, ResponseStatus.ERROR,
                             ResponseStatus.REFUSAL, ResponseStatus.TIMEOUT],
                            weights=[0.85, 0.08, 0.05, 0.02]
                        )[0],
                        # Premium backends get ~1s latency, others ~0.5s.
                        latency_ms=random.gauss(
                            1000 if backend in ["anthropic", "openai-codex"] else 500,
                            200
                        ),
                        ttft_ms=random.gauss(150, 50)
                    )

        yield scorer
        db_path.unlink(missing_ok=True)

    def test_print_score_report(self, populated_scorer, capsys):
        """Test print_score_report doesn't crash."""
        print_score_report(populated_scorer)
        captured = capsys.readouterr()
        assert "UNIWIZARD BACKEND QUALITY SCORES" in captured.out
        assert "anthropic" in captured.out

    def test_print_full_report(self, populated_scorer, capsys):
        """Test print_full_report doesn't crash."""
        print_full_report(populated_scorer)
        captured = capsys.readouterr()
        assert "PER-TASK SPECIALIZATION" in captured.out
        assert "RECOMMENDATIONS" in captured.out
|
class TestEdgeCases:
    """Tests for edge cases and error handling."""

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database for testing."""
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = Path(f.name)
        yield db_path
        db_path.unlink(missing_ok=True)

    @pytest.fixture
    def scorer(self, temp_db):
        """Create a fresh QualityScorer with temp database."""
        return QualityScorer(db_path=temp_db)

    def test_empty_database(self, scorer):
        """Test behavior with empty database."""
        score = scorer.get_backend_score("anthropic", TaskType.CODE.value)

        assert score.total_requests == 0
        assert score.score == 0.0
        assert score.avg_latency_ms == 0.0

    def test_invalid_backend_in_get_score(self, scorer):
        """Test that invalid backend raises error in get_score."""
        with pytest.raises(ValueError, match="Unknown backend"):
            scorer.get_backend_score("invalid")

    def test_invalid_backend_in_breakdown(self, scorer):
        """Test that invalid backend raises error in get_task_breakdown."""
        with pytest.raises(ValueError, match="Unknown backend"):
            scorer.get_task_breakdown("invalid")

    def test_zero_latency(self, scorer):
        """Test handling of zero latency."""
        scorer.record_response(
            backend="groq",
            task_type=TaskType.FAST_OPS,
            status=ResponseStatus.SUCCESS,
            latency_ms=0.0,
            ttft_ms=0.0
        )

        score = scorer.get_backend_score("groq", TaskType.FAST_OPS.value)
        assert score.avg_latency_ms == 0.0
        assert score.score > 50  # Should still have decent score

    def test_very_high_latency(self, scorer):
        """Test handling of very high latency."""
        scorer.record_response(
            backend="openrouter",
            task_type=TaskType.RESEARCH,
            status=ResponseStatus.SUCCESS,
            latency_ms=50000.0,  # 50 seconds
            ttft_ms=5000.0
        )

        score = scorer.get_backend_score("openrouter", TaskType.RESEARCH.value)
        # Success rate is 100% but latency penalty brings it down
        assert score.score < 85  # Should be penalized for high latency

    def test_all_error_responses(self, scorer):
        """Test scoring when all responses are errors."""
        for i in range(10):
            scorer.record_response(
                backend="gemini",
                task_type=TaskType.CODE,
                status=ResponseStatus.ERROR,
                latency_ms=1000.0,
                ttft_ms=200.0
            )

        score = scorer.get_backend_score("gemini", TaskType.CODE.value)
        # 0% success but perfect error/refusal/timeout rate = ~35
        assert score.score < 45  # Should have low score

    def test_all_refusal_responses(self, scorer):
        """Test scoring when all responses are refusals."""
        for i in range(10):
            scorer.record_response(
                backend="gemini",
                task_type=TaskType.CREATIVE,
                status=ResponseStatus.REFUSAL,
                latency_ms=500.0,
                ttft_ms=100.0
            )

        score = scorer.get_backend_score("gemini", TaskType.CREATIVE.value)
        assert score.refusal_count == 10
        # 0% success, 0% error, 100% refusal, good latency = ~49
        assert score.score < 55  # Should be low due to refusals

    def test_metadata_storage(self, scorer):
        """Test that metadata is stored correctly."""
        scorer.record_response(
            backend="anthropic",
            task_type=TaskType.CODE,
            status=ResponseStatus.SUCCESS,
            latency_ms=1000.0,
            ttft_ms=200.0,
            metadata={"model": "claude-3-opus", "region": "us-east-1"}
        )

        # Verify metadata round-trips through the database as JSON.
        conn = sqlite3.connect(str(scorer.db_path))
        row = conn.execute("SELECT metadata FROM responses LIMIT 1").fetchone()
        conn.close()

        import json
        metadata = json.loads(row[0])
        assert metadata["model"] == "claude-3-opus"
|
# Allow running this file directly (equivalent to `pytest <file> -v`).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])