# timmy-home/uniwizard/test_self_grader.py

#!/usr/bin/env python3
"""
Tests for the Self-Grader Module

Run with: python -m pytest test_self_grader.py -v
"""

import json
import sqlite3
import tempfile
from pathlib import Path
from datetime import datetime, timedelta
import pytest

from self_grader import SelfGrader, SessionGrade, WeeklyReport


class TestSessionGrade:
    """Checks that a SessionGrade can be constructed and read back."""

    def test_session_grade_creation(self):
        """Build a SessionGrade from keyword fields and verify four of them."""
        # Collected up front so the constructor call itself stays short.
        fields = dict(
            session_id="test-123",
            session_file="session_test.json",
            graded_at=datetime.now().isoformat(),
            task_completed=True,
            tool_calls_efficient=4,
            response_quality=5,
            errors_recovered=True,
            total_api_calls=10,
            model="claude-opus",
            platform="cli",
            session_start=datetime.now().isoformat(),
            duration_seconds=120.0,
            task_summary="Test task",
            total_errors=0,
            error_types="[]",
            tools_with_errors="[]",
            had_repeated_errors=False,
            had_infinite_loop_risk=False,
            had_user_clarification=False,
        )

        grade = SessionGrade(**fields)

        # The boolean is checked by identity; the rest by equality.
        assert grade.task_completed is True
        observed = (
            grade.session_id,
            grade.tool_calls_efficient,
            grade.response_quality,
        )
        assert observed == ("test-123", 4, 5)


class TestSelfGraderInit:
    """Tests for SelfGrader initialization."""

    def test_init_creates_database(self, tmp_path):
        """Constructing a SelfGrader should create the database and its tables.

        The test points the grader at a fresh database path and an existing
        sessions directory under ``tmp_path``, then checks that the database
        file exists and contains the ``session_grades`` and ``weekly_reports``
        tables.
        """
        db_path = tmp_path / "grades.db"
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        grader = SelfGrader(grades_db_path=db_path, sessions_dir=sessions_dir)

        assert db_path.exists()

        # Using a sqlite3 connection as a context manager only commits or
        # rolls back; it does not close the connection. Close it explicitly
        # so the test does not leak an open handle on the temp database.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = {row[0] for row in cursor.fetchall()}
        finally:
            conn.close()

        assert "session_grades" in tables
        assert "weekly_reports" in tables


class TestErrorDetection:
    """Covers the grader's error detection and error classification helpers."""

    @staticmethod
    def _new_grader(tmp_path):
        """Return a SelfGrader rooted at ``tmp_path`` (not collected by pytest)."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )

    def test_detect_exit_code_error(self, tmp_path):
        """A non-zero exit code is an error; zero or empty output is not."""
        grader = self._new_grader(tmp_path)

        cases = [
            ('{"exit_code": 1, "output": ""}', True),
            ('{"exit_code": 0, "output": "success"}', False),
            ('', False),
        ]
        for payload, expected in cases:
            assert grader._detect_error(payload) is expected

    def test_detect_explicit_error(self, tmp_path):
        """Explicit error payloads, tracebacks and failure text are errors."""
        grader = self._new_grader(tmp_path)

        payloads = (
            '{"error": "file not found"}',
            'Traceback (most recent call last):',
            'Command failed with exception',
        )
        for payload in payloads:
            assert grader._detect_error(payload) is True

    def test_classify_file_not_found(self, tmp_path):
        """A missing-file message is classified as ``file_not_found``."""
        grader = self._new_grader(tmp_path)

        label = grader._classify_error("Error: file '/path/to/file' not found")
        assert label == "file_not_found"

    def test_classify_timeout(self, tmp_path):
        """A timed-out request message is classified as ``timeout``."""
        grader = self._new_grader(tmp_path)

        label = grader._classify_error("Request timed out after 30 seconds")
        assert label == "timeout"

    def test_classify_unknown(self, tmp_path):
        """An unrecognised message is classified as ``unknown``."""
        grader = self._new_grader(tmp_path)

        label = grader._classify_error("Something weird happened")
        assert label == "unknown"


class TestSessionAnalysis:
    """Covers ``SelfGrader._analyze_messages`` on small message lists."""

    @staticmethod
    def _new_grader(tmp_path):
        """Return a SelfGrader rooted at ``tmp_path`` (not collected by pytest)."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )

    def test_analyze_empty_messages(self, tmp_path):
        """An empty message list yields zero counts and no repeated errors."""
        result = self._new_grader(tmp_path)._analyze_messages([])

        assert result['total_api_calls'] == 0
        assert result['total_errors'] == 0
        assert result['had_repeated_errors'] is False

    def test_analyze_simple_session(self, tmp_path):
        """One user turn and one assistant reply count as a single API call."""
        grader = self._new_grader(tmp_path)
        conversation = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
        ]

        result = grader._analyze_messages(conversation)

        assert result['total_api_calls'] == 1
        assert result['total_errors'] == 0

    def test_analyze_session_with_errors(self, tmp_path):
        """A failed tool call followed by a successful retry counts one error."""
        grader = self._new_grader(tmp_path)

        def terminal_call(text):
            # Fresh dicts on every call so no message object is shared.
            return {"role": "assistant", "content": text, "tool_calls": [
                {"function": {"name": "terminal"}}
            ]}

        def terminal_result(payload):
            return {"role": "tool", "name": "terminal", "content": payload}

        conversation = [
            {"role": "user", "content": "Run command"},
            terminal_call(""),
            terminal_result('{"exit_code": 1, "error": "failed"}'),
            terminal_call("Let me try again"),
            terminal_result('{"exit_code": 0, "output": "success"}'),
        ]

        result = grader._analyze_messages(conversation)

        assert result['total_api_calls'] == 2
        assert result['total_errors'] == 1
        assert result['tools_with_errors'] == {"terminal"}

    def test_detect_repeated_errors(self, tmp_path):
        """Five consecutive failing tool calls flag both repetition indicators."""
        grader = self._new_grader(tmp_path)

        conversation = []
        for _ in range(5):
            conversation.extend([
                {"role": "assistant", "content": "", "tool_calls": [
                    {"function": {"name": "terminal"}}
                ]},
                {"role": "tool", "name": "terminal",
                 "content": '{"exit_code": 1, "error": "failed"}'},
            ])

        result = grader._analyze_messages(conversation)

        assert result['had_repeated_errors'] is True
        assert result['had_infinite_loop_risk'] is True


class TestGradingLogic:
    """Covers the individual scoring helpers used when grading a session."""

    @staticmethod
    def _new_grader(tmp_path):
        """Return a SelfGrader rooted at ``tmp_path`` (not collected by pytest)."""
        return SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )

    def test_assess_task_completion_success(self, tmp_path):
        """A closing assistant message reporting success counts as completed."""
        grader = self._new_grader(tmp_path)
        conversation = [
            {"role": "user", "content": "Create a file"},
            {"role": "assistant", "content": "Done! Created the file successfully."},
        ]

        summary = grader._analyze_messages(conversation)
        completed = grader._assess_task_completion(conversation, summary)

        assert completed is True

    def test_assess_tool_efficiency_perfect(self, tmp_path):
        """Five API calls with no errors earn the top efficiency score of 5."""
        grader = self._new_grader(tmp_path)
        summary = dict(total_api_calls=5, total_errors=0)

        assert grader._assess_tool_efficiency(summary) == 5

    def test_assess_tool_efficiency_poor(self, tmp_path):
        """Ten API calls with five errors score no higher than 2."""
        grader = self._new_grader(tmp_path)
        summary = dict(total_api_calls=10, total_errors=5)

        assert grader._assess_tool_efficiency(summary) <= 2

    def test_assess_response_quality_high(self, tmp_path):
        """A long, error-free reply containing a code block scores at least 4."""
        grader = self._new_grader(tmp_path)

        reply = {
            "role": "assistant",
            "content": "Here's the solution:\n```python\nprint('hello')\n```\n" + "x" * 1000,
        }
        conversation = [reply]

        # The analysis refers to the very same message object as the list.
        summary = {
            'final_assistant_msg': reply,
            'total_errors': 0,
            'had_repeated_errors': False,
            'had_infinite_loop_risk': False
        }

        assert grader._assess_response_quality(conversation, summary) >= 4

    def test_error_recovery_success(self, tmp_path):
        """One non-repeated error followed by a success message is recovered."""
        grader = self._new_grader(tmp_path)

        summary = dict(total_errors=1, had_repeated_errors=False)
        conversation = [
            {"role": "assistant", "content": "Success after retry!"}
        ]

        recovered = grader._assess_error_recovery(conversation, summary)
        assert recovered is True


class TestSessionGrading:
    """Tests for full session grading."""

    def test_grade_simple_session(self, tmp_path):
        """Grade a two-message session file and check the resulting grade.

        A JSON session file is written under ``tmp_path`` and passed to
        ``grade_session_file``; the returned grade is expected to carry the
        session id from the file, be marked completed, and count one API call.
        """
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        # Create a test session file
        session_data = {
            "session_id": "test-session-1",
            "model": "test-model",
            "platform": "cli",
            "session_start": datetime.now().isoformat(),
            "message_count": 2,
            "messages": [
                {"role": "user", "content": "Hello, create a test file"},
                {"role": "assistant", "content": "Done! Created test.txt successfully."}
            ]
        }

        session_file = sessions_dir / "session_test-session-1.json"
        # Explicit encoding keeps the fixture independent of the platform's
        # default text encoding.
        with open(session_file, 'w', encoding="utf-8") as f:
            json.dump(session_data, f)

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir
        )

        grade = grader.grade_session_file(session_file)

        assert grade is not None
        assert grade.session_id == "test-session-1"
        assert grade.task_completed is True
        assert grade.total_api_calls == 1

    def test_save_and_retrieve_grade(self, tmp_path):
        """Save a grade and read it back directly from the SQLite database.

        ``save_grade`` is expected to return ``True`` and leave exactly one
        row in ``session_grades`` whose ``task_completed`` column reads as 1.
        """
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir
        )

        grade = SessionGrade(
            session_id="test-save",
            session_file="test.json",
            graded_at=datetime.now().isoformat(),
            task_completed=True,
            tool_calls_efficient=4,
            response_quality=5,
            errors_recovered=True,
            total_api_calls=10,
            model="test-model",
            platform="cli",
            session_start=datetime.now().isoformat(),
            duration_seconds=60.0,
            task_summary="Test",
            total_errors=0,
            error_types="[]",
            tools_with_errors="[]",
            had_repeated_errors=False,
            had_infinite_loop_risk=False,
            had_user_clarification=False
        )

        result = grader.save_grade(grade)
        assert result is True

        # Verify in database. A sqlite3 connection used as a context manager
        # is not closed on exit, so close it explicitly to avoid leaking the
        # handle on the temp database.
        conn = sqlite3.connect(tmp_path / "grades.db")
        try:
            cursor = conn.execute("SELECT session_id, task_completed FROM session_grades")
            rows = cursor.fetchall()
        finally:
            conn.close()

        assert len(rows) == 1
        assert rows[0][0] == "test-save"
        assert rows[0][1] == 1


class TestPatternIdentification:
    """Covers pattern aggregation and task-type inference."""

    def test_identify_patterns_empty(self, tmp_path):
        """With no stored grades, session count and average efficiency are 0."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir
        )

        summary = grader.identify_patterns(days=7)

        assert summary['total_sessions'] == 0
        assert summary['avg_tool_efficiency'] == 0

    def test_infer_task_type(self, tmp_path):
        """Each sample prompt maps to its expected task-type label."""
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )

        expectations = [
            ("Please review this code", "code_review"),
            ("Fix the bug in login", "debugging"),
            ("Add a new feature", "feature_impl"),
            ("Do something random", "general"),
        ]
        for prompt, label in expectations:
            assert grader._infer_task_type(prompt) == label


class TestWeeklyReport:
    """Covers weekly report generation and improvement suggestions."""

    def test_generate_weekly_report_empty(self, tmp_path):
        """An empty database still yields a report with at least one suggestion."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir
        )

        weekly = grader.generate_weekly_report()

        assert weekly.total_sessions == 0
        assert weekly.avg_tool_efficiency == 0
        assert len(weekly.improvement_suggestions) > 0

    def test_generate_suggestions(self, tmp_path):
        """The sample pattern data yields a suggestion mentioning completion rate."""
        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=tmp_path / "sessions"
        )

        patterns = dict(
            completion_rate=50,
            avg_tool_efficiency=2,
            error_recovery_rate=70,
        )
        # Passed positionally, in the same order as before; the local names
        # below are only descriptive guesses at what each list represents.
        task_type_scores = [('code_review', 2.0)]
        tool_error_counts = [('terminal', 5)]
        error_type_counts = [('file_not_found', 3)]

        suggestions = grader._generate_suggestions(
            patterns,
            task_type_scores,
            tool_error_counts,
            error_type_counts,
        )

        assert len(suggestions) > 0
        assert any('completion rate' in text.lower() for text in suggestions)


class TestGradeLatestSessions:
    """Covers batch grading of the most recent session files."""

    def test_grade_latest_skips_graded(self, tmp_path):
        """A second grading pass returns nothing for an already-graded session."""
        sessions_dir = tmp_path / "sessions"
        sessions_dir.mkdir()

        # A single minimal session on disk.
        payload = {
            "session_id": "already-graded",
            "model": "test",
            "messages": [
                {"role": "user", "content": "Test"},
                {"role": "assistant", "content": "Done"}
            ]
        }
        session_file = sessions_dir / "session_already-graded.json"
        session_file.write_text(json.dumps(payload))

        grader = SelfGrader(
            grades_db_path=tmp_path / "grades.db",
            sessions_dir=sessions_dir
        )

        # The first pass grades the one session; the second finds nothing new.
        first_pass = grader.grade_latest_sessions(n=10)
        assert len(first_pass) == 1

        second_pass = grader.grade_latest_sessions(n=10)
        assert len(second_pass) == 0


def test_main_cli():
    """The ``self_grader`` module exposes a callable ``main`` entry point."""
    # Imported inside the test so a missing entry point fails only this test.
    from self_grader import main as cli_entry
    assert callable(cli_entry)


if __name__ == '__main__':
    # Propagate pytest's exit code so running this file directly reports
    # test failures to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, '-v']))