Files
timmy-home/uniwizard/test_self_grader.py

499 lines
16 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Tests for the Self-Grader Module
Run with: python -m pytest test_self_grader.py -v
"""
import json
import sqlite3
import tempfile
from pathlib import Path
from datetime import datetime, timedelta
import pytest
from self_grader import SelfGrader, SessionGrade, WeeklyReport
class TestSessionGrade:
    """Checks that a SessionGrade keeps the values it is constructed with."""

    def test_session_grade_creation(self):
        """Build a SessionGrade from keyword fields and read a few of them back."""
        fields = {
            "session_id": "test-123",
            "session_file": "session_test.json",
            "graded_at": datetime.now().isoformat(),
            "task_completed": True,
            "tool_calls_efficient": 4,
            "response_quality": 5,
            "errors_recovered": True,
            "total_api_calls": 10,
            "model": "claude-opus",
            "platform": "cli",
            "session_start": datetime.now().isoformat(),
            "duration_seconds": 120.0,
            "task_summary": "Test task",
            "total_errors": 0,
            "error_types": "[]",
            "tools_with_errors": "[]",
            "had_repeated_errors": False,
            "had_infinite_loop_risk": False,
            "had_user_clarification": False,
        }

        grade = SessionGrade(**fields)

        assert grade.session_id == "test-123"
        assert grade.task_completed is True
        assert grade.tool_calls_efficient == 4
        assert grade.response_quality == 5
class TestSelfGraderInit:
    """Tests for SelfGrader initialization."""

    def test_init_creates_database(self, tmp_path):
        """Constructing a SelfGrader creates the database file and its tables."""
        db_path = tmp_path / "grades.db"
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        grader = SelfGrader(grades_db_path=db_path, sessions_dir=sessions_dir)

        assert db_path.exists()

        # sqlite3's connection context manager only commits or rolls back the
        # transaction; it does not close the connection. Close it explicitly
        # so the handle is not left open on the temporary database file.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = {row[0] for row in cursor.fetchall()}
        finally:
            conn.close()

        assert "session_grades" in tables
        assert "weekly_reports" in tables
class TestErrorDetection:
    """Exercises SelfGrader._detect_error and SelfGrader._classify_error."""

    @staticmethod
    def _make_grader(tmp_path):
        """Return a SelfGrader whose database and sessions paths sit under tmp_path."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )

    def test_detect_exit_code_error(self, tmp_path):
        """Exit code 1 is flagged; exit code 0 and empty output are not."""
        detect = self._make_grader(tmp_path)._detect_error

        assert detect('{"exit_code": 1, "output": ""}') is True
        assert detect('{"exit_code": 0, "output": "success"}') is False
        assert detect('') is False

    def test_detect_explicit_error(self, tmp_path):
        """Explicit error payloads, tracebacks and exception text are flagged."""
        detect = self._make_grader(tmp_path)._detect_error

        assert detect('{"error": "file not found"}') is True
        assert detect('Traceback (most recent call last):') is True
        assert detect('Command failed with exception') is True

    def test_classify_file_not_found(self, tmp_path):
        """A missing-file message is classified as "file_not_found"."""
        category = self._make_grader(tmp_path)._classify_error(
            "Error: file '/path/to/file' not found"
        )
        assert category == "file_not_found"

    def test_classify_timeout(self, tmp_path):
        """A timed-out request message is classified as "timeout"."""
        category = self._make_grader(tmp_path)._classify_error(
            "Request timed out after 30 seconds"
        )
        assert category == "timeout"

    def test_classify_unknown(self, tmp_path):
        """An unrecognised message is classified as "unknown"."""
        category = self._make_grader(tmp_path)._classify_error(
            "Something weird happened"
        )
        assert category == "unknown"
class TestSessionAnalysis:
    """Exercises SelfGrader._analyze_messages on hand-built message lists."""

    @staticmethod
    def _make_grader(tmp_path):
        """Return a SelfGrader whose database and sessions paths sit under tmp_path."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )

    def test_analyze_empty_messages(self, tmp_path):
        """An empty message list yields zero calls, zero errors, no repeats."""
        summary = self._make_grader(tmp_path)._analyze_messages([])

        assert summary["total_api_calls"] == 0
        assert summary["total_errors"] == 0
        assert summary["had_repeated_errors"] is False

    def test_analyze_simple_session(self, tmp_path):
        """One user turn plus one assistant reply counts as a single API call."""
        conversation = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
        ]

        summary = self._make_grader(tmp_path)._analyze_messages(conversation)

        assert summary["total_api_calls"] == 1
        assert summary["total_errors"] == 0

    def test_analyze_session_with_errors(self, tmp_path):
        """A failed terminal call followed by a successful retry is one error."""
        failed_output = '{"exit_code": 1, "error": "failed"}'
        ok_output = '{"exit_code": 0, "output": "success"}'
        conversation = [
            {"role": "user", "content": "Run command"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [{"function": {"name": "terminal"}}],
            },
            {"role": "tool", "name": "terminal", "content": failed_output},
            {
                "role": "assistant",
                "content": "Let me try again",
                "tool_calls": [{"function": {"name": "terminal"}}],
            },
            {"role": "tool", "name": "terminal", "content": ok_output},
        ]

        summary = self._make_grader(tmp_path)._analyze_messages(conversation)

        assert summary["total_api_calls"] == 2
        assert summary["total_errors"] == 1
        assert summary["tools_with_errors"] == {"terminal"}

    def test_detect_repeated_errors(self, tmp_path):
        """Five consecutive failing terminal calls set both risk flags."""
        conversation = []
        for _ in range(5):
            # Fresh dicts on every pass, so no message object is shared.
            conversation.extend([
                {
                    "role": "assistant",
                    "content": "",
                    "tool_calls": [{"function": {"name": "terminal"}}],
                },
                {
                    "role": "tool",
                    "name": "terminal",
                    "content": '{"exit_code": 1, "error": "failed"}',
                },
            ])

        summary = self._make_grader(tmp_path)._analyze_messages(conversation)

        assert summary["had_repeated_errors"] is True
        assert summary["had_infinite_loop_risk"] is True
class TestGradingLogic:
    """Exercises the individual SelfGrader._assess_* scoring helpers."""

    @staticmethod
    def _make_grader(tmp_path):
        """Return a SelfGrader whose database and sessions paths sit under tmp_path."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )

    def test_assess_task_completion_success(self, tmp_path):
        """A closing "Done! ..." reply is assessed as a completed task."""
        grader = self._make_grader(tmp_path)
        conversation = [
            {"role": "user", "content": "Create a file"},
            {"role": "assistant", "content": "Done! Created the file successfully."},
        ]

        summary = grader._analyze_messages(conversation)
        completed = grader._assess_task_completion(conversation, summary)

        assert completed is True

    def test_assess_tool_efficiency_perfect(self, tmp_path):
        """Five API calls with no errors score exactly 5."""
        grader = self._make_grader(tmp_path)
        summary = {"total_api_calls": 5, "total_errors": 0}

        assert grader._assess_tool_efficiency(summary) == 5

    def test_assess_tool_efficiency_poor(self, tmp_path):
        """Ten API calls with five errors score at most 2."""
        grader = self._make_grader(tmp_path)
        summary = {"total_api_calls": 10, "total_errors": 5}

        assert grader._assess_tool_efficiency(summary) <= 2

    def test_assess_response_quality_high(self, tmp_path):
        """A long, code-fenced, error-free final reply scores at least 4."""
        grader = self._make_grader(tmp_path)
        final_reply = {
            "role": "assistant",
            "content": "Here's the solution:\n```python\nprint('hello')\n```\n" + "x" * 1000,
        }
        conversation = [final_reply]
        # The summary refers to the very same message object held in the list.
        summary = {
            "final_assistant_msg": final_reply,
            "total_errors": 0,
            "had_repeated_errors": False,
            "had_infinite_loop_risk": False,
        }

        quality = grader._assess_response_quality(conversation, summary)

        assert quality >= 4

    def test_error_recovery_success(self, tmp_path):
        """One non-repeated error with a successful final reply counts as recovered."""
        grader = self._make_grader(tmp_path)
        conversation = [{"role": "assistant", "content": "Success after retry!"}]
        summary = {"total_errors": 1, "had_repeated_errors": False}

        recovered = grader._assess_error_recovery(conversation, summary)

        assert recovered is True
class TestSessionGrading:
    """Tests for full session grading."""

    def test_grade_simple_session(self, tmp_path):
        """Grade a two-message session file and check the resulting grade."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        # Create a test session file
        session_data = {
            "session_id": "test-session-1",
            "model": "test-model",
            "platform": "cli",
            "session_start": datetime.now().isoformat(),
            "message_count": 2,
            "messages": [
                {"role": "user", "content": "Hello, create a test file"},
                {"role": "assistant", "content": "Done! Created test.txt successfully."}
            ]
        }
        session_file = sessions_dir / "session_test-session-1.json"
        # Explicit encoding keeps the fixture independent of the platform's
        # default locale encoding.
        with open(session_file, 'w', encoding='utf-8') as f:
            json.dump(session_data, f)

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir
        )
        grade = grader.grade_session_file(session_file)

        assert grade is not None
        assert grade.session_id == "test-session-1"
        assert grade.task_completed is True
        assert grade.total_api_calls == 1

    def test_save_and_retrieve_grade(self, tmp_path):
        """Save a grade and read it back directly from the SQLite database."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir
        )
        grade = SessionGrade(
            session_id="test-save",
            session_file="test.json",
            graded_at=datetime.now().isoformat(),
            task_completed=True,
            tool_calls_efficient=4,
            response_quality=5,
            errors_recovered=True,
            total_api_calls=10,
            model="test-model",
            platform="cli",
            session_start=datetime.now().isoformat(),
            duration_seconds=60.0,
            task_summary="Test",
            total_errors=0,
            error_types="[]",
            tools_with_errors="[]",
            had_repeated_errors=False,
            had_infinite_loop_risk=False,
            had_user_clarification=False
        )

        result = grader.save_grade(grade)
        assert result is True

        # Verify in database. sqlite3's connection context manager does not
        # close the connection, so it is closed explicitly here.
        conn = sqlite3.connect(tmp_path / "grades.db")
        try:
            cursor = conn.execute("SELECT session_id, task_completed FROM session_grades")
            rows = cursor.fetchall()
        finally:
            conn.close()

        assert len(rows) == 1
        assert rows[0][0] == "test-save"
        # The asserted value is the integer 1 as read back from the column.
        assert rows[0][1] == 1
class TestPatternIdentification:
    """Exercises SelfGrader.identify_patterns and SelfGrader._infer_task_type."""

    def test_identify_patterns_empty(self, tmp_path):
        """With no graded sessions, the totals and averages are zero."""
        session_root = tmp_path / "sessions"
        session_root.mkdir()
        grader = SelfGrader(grades_db_path=tmp_path / "grades.db", sessions_dir=session_root)

        found = grader.identify_patterns(days=7)

        assert found["total_sessions"] == 0
        assert found["avg_tool_efficiency"] == 0

    def test_infer_task_type(self, tmp_path):
        """Each sample prompt maps to its expected task-type label."""
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )
        expectations = (
            ("Please review this code", "code_review"),
            ("Fix the bug in login", "debugging"),
            ("Add a new feature", "feature_impl"),
            ("Do something random", "general"),
        )

        for prompt, label in expectations:
            assert grader._infer_task_type(prompt) == label
class TestWeeklyReport:
    """Exercises weekly report generation and suggestion building."""

    def test_generate_weekly_report_empty(self, tmp_path):
        """With no data, the report is zeroed but still carries suggestions."""
        session_root = tmp_path / "sessions"
        session_root.mkdir()
        grader = SelfGrader(grades_db_path=tmp_path / "grades.db", sessions_dir=session_root)

        weekly = grader.generate_weekly_report()

        assert weekly.total_sessions == 0
        assert weekly.avg_tool_efficiency == 0
        assert len(weekly.improvement_suggestions) > 0

    def test_generate_suggestions(self, tmp_path):
        """A 50% completion rate produces a suggestion mentioning completion rate."""
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )
        stats = {
            "completion_rate": 50,
            "avg_tool_efficiency": 2,
            "error_recovery_rate": 70,
        }
        # The three positional lists are passed exactly as in the call below;
        # their meaning is defined by _generate_suggestions, not by this test.
        task_scores = [("code_review", 2.0)]
        tool_counts = [("terminal", 5)]
        error_counts = [("file_not_found", 3)]

        advice = grader._generate_suggestions(stats, task_scores, tool_counts, error_counts)

        assert len(advice) > 0
        assert any('completion rate' in item.lower() for item in advice)
class TestGradeLatestSessions:
    """Exercises SelfGrader.grade_latest_sessions."""

    def test_grade_latest_skips_graded(self, tmp_path):
        """A session graded on the first pass is not graded again on the second."""
        session_root = tmp_path / "sessions"
        session_root.mkdir()

        # Write a single session file for the grader to pick up.
        payload = {
            "session_id": "already-graded",
            "model": "test",
            "messages": [
                {"role": "user", "content": "Test"},
                {"role": "assistant", "content": "Done"},
            ],
        }
        (session_root / "session_already-graded.json").write_text(json.dumps(payload))

        grader = SelfGrader(grades_db_path=tmp_path / "grades.db", sessions_dir=session_root)

        # First pass grades the one available session.
        first_pass = grader.grade_latest_sessions(n=10)
        assert len(first_pass) == 1

        # Second pass finds nothing new to grade.
        second_pass = grader.grade_latest_sessions(n=10)
        assert len(second_pass) == 0
def test_main_cli():
    """The self_grader module exposes a callable named main."""
    from self_grader import main as cli_entry_point

    assert callable(cli_entry_point)
if __name__ == '__main__':
    # Propagate pytest's exit code so that running this file directly reports
    # test failures to the calling shell instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, '-v']))