Co-authored-by: Kimi Claw <kimi@timmytime.ai> Co-committed-by: Kimi Claw <kimi@timmytime.ai>
499 lines
16 KiB
Python
499 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Tests for the Self-Grader Module
|
|
|
|
Run with: python -m pytest test_self_grader.py -v
|
|
"""
|
|
|
|
import json
|
|
import sqlite3
|
|
import tempfile
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
import pytest
|
|
|
|
from self_grader import SelfGrader, SessionGrade, WeeklyReport
|
|
|
|
|
|
class TestSessionGrade:
    """Checks that a SessionGrade keeps the values it was constructed with."""

    def test_session_grade_creation(self):
        """Build a SessionGrade from keyword fields and read a few of them back."""
        # Field values are collected first and then splatted into the
        # constructor; insertion order matches the keyword order used before.
        fields = {
            "session_id": "test-123",
            "session_file": "session_test.json",
            "graded_at": datetime.now().isoformat(),
            "task_completed": True,
            "tool_calls_efficient": 4,
            "response_quality": 5,
            "errors_recovered": True,
            "total_api_calls": 10,
            "model": "claude-opus",
            "platform": "cli",
            "session_start": datetime.now().isoformat(),
            "duration_seconds": 120.0,
            "task_summary": "Test task",
            "total_errors": 0,
            "error_types": "[]",
            "tools_with_errors": "[]",
            "had_repeated_errors": False,
            "had_infinite_loop_risk": False,
            "had_user_clarification": False,
        }
        grade = SessionGrade(**fields)

        assert grade.session_id == "test-123"
        assert grade.task_completed is True
        assert grade.tool_calls_efficient == 4
        assert grade.response_quality == 5
|
|
|
|
|
|
class TestSelfGraderInit:
    """Tests for SelfGrader initialization."""

    def test_init_creates_database(self, tmp_path):
        """Constructing a SelfGrader should create the database file and tables.

        Args:
            tmp_path: pytest-provided temporary directory, unique to this test.
        """
        db_path = tmp_path / "grades.db"
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        # Instantiated only for its side effect; the instance itself is not
        # needed by the assertions below.
        SelfGrader(grades_db_path=db_path, sessions_dir=sessions_dir)

        assert db_path.exists()

        # Check tables exist.  A sqlite3 connection used as a context manager
        # only commits or rolls back the transaction -- it does not close the
        # connection -- so close it explicitly to avoid leaking the handle.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = {row[0] for row in cursor.fetchall()}
        finally:
            conn.close()

        assert "session_grades" in tables
        assert "weekly_reports" in tables
|
|
|
|
|
|
class TestErrorDetection:
    """Exercises SelfGrader's error detection and error classification."""

    @staticmethod
    def _new_grader(tmp_path):
        """Return a SelfGrader whose database and sessions paths sit under tmp_path.

        The sessions directory is passed as a path only; it is not created here.
        """
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

    def test_detect_exit_code_error(self, tmp_path):
        """A non-zero exit code counts as an error; zero or empty output does not."""
        detect = self._new_grader(tmp_path)._detect_error

        assert detect('{"exit_code": 1, "output": ""}') is True
        assert detect('{"exit_code": 0, "output": "success"}') is False
        assert detect('') is False

    def test_detect_explicit_error(self, tmp_path):
        """Explicit error payloads, tracebacks and failure text are detected."""
        detect = self._new_grader(tmp_path)._detect_error

        assert detect('{"error": "file not found"}') is True
        assert detect('Traceback (most recent call last):') is True
        assert detect('Command failed with exception') is True

    def test_classify_file_not_found(self, tmp_path):
        """A missing-file message is classified as "file_not_found"."""
        classify = self._new_grader(tmp_path)._classify_error
        message = "Error: file '/path/to/file' not found"

        assert classify(message) == "file_not_found"

    def test_classify_timeout(self, tmp_path):
        """A timed-out request message is classified as "timeout"."""
        classify = self._new_grader(tmp_path)._classify_error
        message = "Request timed out after 30 seconds"

        assert classify(message) == "timeout"

    def test_classify_unknown(self, tmp_path):
        """A message matching no known category is classified as "unknown"."""
        classify = self._new_grader(tmp_path)._classify_error
        message = "Something weird happened"

        assert classify(message) == "unknown"
|
|
|
|
|
|
class TestSessionAnalysis:
    """Exercises SelfGrader._analyze_messages on hand-built message lists."""

    @staticmethod
    def _new_grader(tmp_path):
        """Return a SelfGrader whose database and sessions paths sit under tmp_path."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

    def test_analyze_empty_messages(self, tmp_path):
        """An empty message list yields zero counts and no repeated-error flag."""
        summary = self._new_grader(tmp_path)._analyze_messages([])

        assert summary['total_api_calls'] == 0
        assert summary['total_errors'] == 0
        assert summary['had_repeated_errors'] is False

    def test_analyze_simple_session(self, tmp_path):
        """One user turn plus one assistant reply counts as one API call, no errors."""
        conversation = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
        ]

        summary = self._new_grader(tmp_path)._analyze_messages(conversation)

        assert summary['total_api_calls'] == 1
        assert summary['total_errors'] == 0

    def test_analyze_session_with_errors(self, tmp_path):
        """A failed tool call followed by a successful retry records one error."""
        conversation = [
            {"role": "user", "content": "Run command"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [{"function": {"name": "terminal"}}],
            },
            {
                "role": "tool",
                "name": "terminal",
                "content": '{"exit_code": 1, "error": "failed"}',
            },
            {
                "role": "assistant",
                "content": "Let me try again",
                "tool_calls": [{"function": {"name": "terminal"}}],
            },
            {
                "role": "tool",
                "name": "terminal",
                "content": '{"exit_code": 0, "output": "success"}',
            },
        ]

        summary = self._new_grader(tmp_path)._analyze_messages(conversation)

        assert summary['total_api_calls'] == 2
        assert summary['total_errors'] == 1
        assert summary['tools_with_errors'] == {"terminal"}

    def test_detect_repeated_errors(self, tmp_path):
        """Five consecutive failing tool calls set both repeated-error flags."""
        conversation = []
        for _ in range(5):
            # Fresh dicts on every pass so no two entries share an object.
            conversation.extend([
                {
                    "role": "assistant",
                    "content": "",
                    "tool_calls": [{"function": {"name": "terminal"}}],
                },
                {
                    "role": "tool",
                    "name": "terminal",
                    "content": '{"exit_code": 1, "error": "failed"}',
                },
            ])

        summary = self._new_grader(tmp_path)._analyze_messages(conversation)

        assert summary['had_repeated_errors'] is True
        assert summary['had_infinite_loop_risk'] is True
|
|
|
|
|
|
class TestGradingLogic:
    """Exercises the individual scoring helpers on SelfGrader."""

    @staticmethod
    def _new_grader(tmp_path):
        """Return a SelfGrader whose database and sessions paths sit under tmp_path."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

    def test_assess_task_completion_success(self, tmp_path):
        """A closing assistant message reporting success counts as completed."""
        grader = self._new_grader(tmp_path)
        conversation = [
            {"role": "user", "content": "Create a file"},
            {"role": "assistant", "content": "Done! Created the file successfully."},
        ]

        summary = grader._analyze_messages(conversation)
        completed = grader._assess_task_completion(conversation, summary)

        assert completed is True

    def test_assess_tool_efficiency_perfect(self, tmp_path):
        """Five API calls with zero errors earn the top efficiency score of 5."""
        grader = self._new_grader(tmp_path)
        summary = dict(total_api_calls=5, total_errors=0)

        assert grader._assess_tool_efficiency(summary) == 5

    def test_assess_tool_efficiency_poor(self, tmp_path):
        """Ten API calls with five errors score 2 or lower."""
        grader = self._new_grader(tmp_path)
        summary = dict(total_api_calls=10, total_errors=5)

        assert grader._assess_tool_efficiency(summary) <= 2

    def test_assess_response_quality_high(self, tmp_path):
        """A long reply containing a code block and no errors scores 4 or more."""
        grader = self._new_grader(tmp_path)

        reply = {
            "role": "assistant",
            "content": "Here's the solution:\n```python\nprint('hello')\n```\n" + "x" * 1000,
        }
        conversation = [reply]
        # The analysis refers to the very same dict object held in the list.
        summary = dict(
            final_assistant_msg=reply,
            total_errors=0,
            had_repeated_errors=False,
            had_infinite_loop_risk=False,
        )

        assert grader._assess_response_quality(conversation, summary) >= 4

    def test_error_recovery_success(self, tmp_path):
        """One non-repeated error followed by a success message counts as recovered."""
        grader = self._new_grader(tmp_path)
        summary = dict(total_errors=1, had_repeated_errors=False)
        conversation = [{"role": "assistant", "content": "Success after retry!"}]

        recovered = grader._assess_error_recovery(conversation, summary)

        assert recovered is True
|
|
|
|
|
|
class TestSessionGrading:
    """Tests for full session grading."""

    def test_grade_simple_session(self, tmp_path):
        """Grade a two-message session file written to a temporary directory.

        Args:
            tmp_path: pytest-provided temporary directory, unique to this test.
        """
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        # Create a test session file
        session_data = {
            "session_id": "test-session-1",
            "model": "test-model",
            "platform": "cli",
            "session_start": datetime.now().isoformat(),
            "message_count": 2,
            "messages": [
                {"role": "user", "content": "Hello, create a test file"},
                {"role": "assistant", "content": "Done! Created test.txt successfully."},
            ],
        }

        session_file = sessions_dir / "session_test-session-1.json"
        # Explicit encoding so the fixture does not depend on the locale default.
        with open(session_file, 'w', encoding="utf-8") as f:
            json.dump(session_data, f)

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir,
        )

        grade = grader.grade_session_file(session_file)

        assert grade is not None
        assert grade.session_id == "test-session-1"
        assert grade.task_completed is True
        assert grade.total_api_calls == 1

    def test_save_and_retrieve_grade(self, tmp_path):
        """Save a grade and read it back directly from the SQLite database.

        Args:
            tmp_path: pytest-provided temporary directory, unique to this test.
        """
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()
        db_path = tmp_path / "grades.db"

        grader = SelfGrader(
            grades_db_path=db_path,
            sessions_dir=sessions_dir,
        )

        grade = SessionGrade(
            session_id="test-save",
            session_file="test.json",
            graded_at=datetime.now().isoformat(),
            task_completed=True,
            tool_calls_efficient=4,
            response_quality=5,
            errors_recovered=True,
            total_api_calls=10,
            model="test-model",
            platform="cli",
            session_start=datetime.now().isoformat(),
            duration_seconds=60.0,
            task_summary="Test",
            total_errors=0,
            error_types="[]",
            tools_with_errors="[]",
            had_repeated_errors=False,
            had_infinite_loop_risk=False,
            had_user_clarification=False,
        )

        result = grader.save_grade(grade)
        assert result is True

        # Verify in database.  A sqlite3 connection used as a context manager
        # only commits or rolls back -- it does not close the connection -- so
        # close it explicitly to avoid leaking the handle.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.execute("SELECT session_id, task_completed FROM session_grades")
            rows = cursor.fetchall()
        finally:
            conn.close()

        assert len(rows) == 1
        assert rows[0][0] == "test-save"
        # The test expects the boolean to come back from SQLite as integer 1.
        assert rows[0][1] == 1
|
|
|
|
|
|
class TestPatternIdentification:
    """Exercises pattern identification and task-type inference on SelfGrader."""

    def test_identify_patterns_empty(self, tmp_path):
        """With no graded sessions the session count and average efficiency are 0."""
        session_root = tmp_path / "sessions"
        session_root.mkdir()
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=session_root,
        )

        found = grader.identify_patterns(days=7)

        assert found['total_sessions'] == 0
        assert found['avg_tool_efficiency'] == 0

    def test_infer_task_type(self, tmp_path):
        """Each sample prompt maps to the task type listed beside it."""
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

        # Checked in order; the first mismatch fails the test.
        expectations = (
            ("Please review this code", "code_review"),
            ("Fix the bug in login", "debugging"),
            ("Add a new feature", "feature_impl"),
            ("Do something random", "general"),
        )
        for prompt, task_type in expectations:
            assert grader._infer_task_type(prompt) == task_type
|
|
|
|
|
|
class TestWeeklyReport:
    """Exercises weekly report generation and suggestion building."""

    def test_generate_weekly_report_empty(self, tmp_path):
        """An empty database still yields a report with at least one suggestion."""
        session_root = tmp_path / "sessions"
        session_root.mkdir()
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=session_root,
        )

        weekly = grader.generate_weekly_report()

        assert weekly.total_sessions == 0
        assert weekly.avg_tool_efficiency == 0
        assert len(weekly.improvement_suggestions) > 0

    def test_generate_suggestions(self, tmp_path):
        """The sample patterns produce a suggestion mentioning the completion rate."""
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions",
        )

        stats = dict(
            completion_rate=50,
            avg_tool_efficiency=2,
            error_recovery_rate=70,
        )
        # The three lists are passed positionally, in this order.
        by_task_type = [('code_review', 2.0)]
        by_tool = [('terminal', 5)]
        by_error_type = [('file_not_found', 3)]

        suggestions = grader._generate_suggestions(
            stats, by_task_type, by_tool, by_error_type
        )

        assert len(suggestions) > 0
        assert any('completion rate' in text.lower() for text in suggestions)
|
|
|
|
|
|
class TestGradeLatestSessions:
    """Exercises SelfGrader.grade_latest_sessions."""

    def test_grade_latest_skips_graded(self, tmp_path):
        """A session graded on the first pass is not returned by a second pass."""
        session_root = tmp_path / "sessions"
        session_root.mkdir()

        # Write the single session file the grader should pick up.
        payload = {
            "session_id": "already-graded",
            "model": "test",
            "messages": [
                {"role": "user", "content": "Test"},
                {"role": "assistant", "content": "Done"},
            ],
        }
        target = session_root / "session_already-graded.json"
        target.write_text(json.dumps(payload))

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=session_root,
        )

        # First pass grades the one session.
        first_pass = grader.grade_latest_sessions(n=10)
        assert len(first_pass) == 1

        # Second pass finds nothing new to grade.
        second_pass = grader.grade_latest_sessions(n=10)
        assert len(second_pass) == 0
|
|
|
|
|
|
def test_main_cli():
    """The self_grader module exposes a callable named ``main``."""
    from self_grader import main as cli_entry_point

    assert callable(cli_entry_point)
|
|
|
|
|
|
if __name__ == '__main__':
    # Allow running this file directly.  pytest.main() returns the run's exit
    # code; propagate it so a failing run yields a non-zero process status
    # instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))
|