"""Unit tests for the AutoLoRA continuous improvement loop.

Covers trajectory extraction, quality filtering, dataset management,
and the retrain orchestrator.

Refs: #1105
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import tempfile
|
|
from datetime import UTC, datetime, timedelta
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from timmy_automations.retrain.quality_filter import QualityFilter, TrajectoryQuality
|
|
from timmy_automations.retrain.retrain import RetrainOrchestrator
|
|
from timmy_automations.retrain.training_dataset import TrainingDataset
|
|
from timmy_automations.retrain.training_log import CycleMetrics, TrainingLog
|
|
from timmy_automations.retrain.trajectory_exporter import Trajectory, TrajectoryExporter
|
|
|
|
|
|
# ── Fixtures ─────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _ts(offset_minutes: int = 0) -> str:
|
|
"""Return an ISO timestamp offset from now."""
|
|
return (datetime.now(tz=UTC) + timedelta(minutes=offset_minutes)).isoformat()
|
|
|
|
|
|
def _make_session_log(entries: list[dict], date_str: str, tmp_path: Path) -> Path:
|
|
"""Write session JSONL entries to a temp log file."""
|
|
log_dir = tmp_path / "logs"
|
|
log_dir.mkdir(parents=True, exist_ok=True)
|
|
log_file = log_dir / f"session_{date_str}.jsonl"
|
|
with open(log_file, "w") as f:
|
|
for entry in entries:
|
|
f.write(json.dumps(entry) + "\n")
|
|
return log_file
|
|
|
|
|
|
def _user_msg(content: str, offset: int = 0) -> dict:
    """Build a user-role message entry stamped *offset* minutes from now."""
    return {
        "type": "message",
        "role": "user",
        "content": content,
        "timestamp": _ts(offset),
    }
|
|
|
|
|
|
def _timmy_msg(content: str, confidence: float | None = None, offset: int = 0) -> dict:
    """Build a timmy-role message entry, optionally carrying a confidence score."""
    msg = {
        "type": "message",
        "role": "timmy",
        "content": content,
        "timestamp": _ts(offset),
    }
    # Only attach the key when a confidence was actually supplied.
    return msg if confidence is None else {**msg, "confidence": confidence}
|
|
|
|
|
|
def _tool_call(tool: str = "bash", result: str = "ok", offset: int = 0) -> dict:
    """Build a tool_call entry with empty args and the given result string."""
    entry = {"type": "tool_call", "tool": tool, "args": {}}
    entry["result"] = result
    entry["timestamp"] = _ts(offset)
    return entry
|
|
|
|
|
|
def _error_entry(msg: str = "Something failed", offset: int = 0) -> dict:
    """Build an error entry stamped *offset* minutes from now."""
    return {
        "type": "error",
        "error": msg,
        "timestamp": _ts(offset),
    }
|
|
|
|
|
|
def _decision_entry(decision: str = "Use approach A", offset: int = 0) -> dict:
    """Build a decision entry stamped *offset* minutes from now."""
    return {
        "type": "decision",
        "decision": decision,
        "timestamp": _ts(offset),
    }
|
|
|
|
|
|
# ── Trajectory dataclass tests ────────────────────────────────────────────────
|
|
|
|
|
|
class TestTrajectory:
    """Behavioral tests for the Trajectory dataclass properties."""

    @staticmethod
    def _traj(**fields) -> Trajectory:
        """Build a Trajectory for 2026-03-17 with the given field overrides."""
        return Trajectory(
            session_date="2026-03-17",
            started_at=_ts(),
            ended_at=_ts(),
            **fields,
        )

    def test_message_count(self):
        t = self._traj(messages=[_user_msg("hi"), _timmy_msg("hello")])
        assert t.message_count == 2

    def test_tool_call_count(self):
        t = self._traj(tool_calls=[_tool_call(), _tool_call()])
        assert t.tool_call_count == 2

    def test_has_successful_tool_call_when_no_errors(self):
        t = self._traj(tool_calls=[_tool_call()], errors=[])
        assert t.has_successful_tool_call is True

    def test_has_successful_tool_call_false_when_errors(self):
        t = self._traj(tool_calls=[_tool_call()], errors=[_error_entry()])
        assert t.has_successful_tool_call is False

    def test_is_multi_step(self):
        t = self._traj(
            messages=[_user_msg("do it"), _timmy_msg("done")],
            tool_calls=[_tool_call()],
        )
        assert t.is_multi_step is True

    def test_is_not_multi_step_single_message(self):
        t = self._traj(messages=[_timmy_msg("hello")], tool_calls=[])
        assert t.is_multi_step is False

    def test_to_chat_format_ordering(self):
        t = self._traj(
            messages=[_user_msg("question", offset=0), _timmy_msg("answer", offset=2)],
            tool_calls=[_tool_call(offset=1)],
        )
        roles = [m["role"] for m in t.to_chat_format()]
        assert "user" in roles
        assert "assistant" in roles

    def test_to_chat_format_empty_content_skipped(self):
        t = self._traj(messages=[_user_msg(""), _timmy_msg("response")])
        # The empty user message must not survive conversion to chat format.
        assert all(m["content"] for m in t.to_chat_format())
|
|
|
|
|
|
# ── TrajectoryExporter tests ──────────────────────────────────────────────────
|
|
|
|
|
|
class TestTrajectoryExporter:
    """Tests for exporting weekly trajectories from session log files."""

    @staticmethod
    def _exporter(tmp_path) -> TrajectoryExporter:
        """Exporter rooted at the temp dir, reading from its ``logs`` subdir."""
        return TrajectoryExporter(logs_dir=tmp_path / "logs", repo_root=tmp_path)

    def test_export_empty_logs_dir(self, tmp_path):
        (tmp_path / "logs").mkdir()
        assert self._exporter(tmp_path).export_week(weeks_ago=0) == []

    def test_export_reads_session_files(self, tmp_path):
        # A session logged today belongs to the current week.
        date_str = datetime.now(tz=UTC).strftime("%Y-%m-%d")
        _make_session_log(
            [_user_msg("tell me about Python"), _timmy_msg("Python is great")],
            date_str,
            tmp_path,
        )
        assert len(self._exporter(tmp_path).export_week(weeks_ago=0)) >= 1

    def test_export_skips_old_sessions(self, tmp_path):
        # Data from 3 weeks back must not leak into the current week's export.
        stale = datetime.now(tz=UTC) - timedelta(weeks=3)
        _make_session_log(
            [_user_msg("old message"), _timmy_msg("old response")],
            stale.strftime("%Y-%m-%d"),
            tmp_path,
        )
        assert self._exporter(tmp_path).export_week(weeks_ago=0) == []

    def test_export_segments_by_gap(self, tmp_path):
        now = datetime.now(tz=UTC)
        date_str = now.strftime("%Y-%m-%d")

        def stamp(minutes_back: int) -> str:
            return (now - timedelta(minutes=minutes_back)).isoformat()

        # Two exchanges separated by a 10-minute silence.
        entries = [
            {"type": "message", "role": "user", "content": "first q", "timestamp": stamp(15)},
            {"type": "message", "role": "timmy", "content": "first a", "timestamp": stamp(14)},
            {"type": "message", "role": "user", "content": "second q", "timestamp": stamp(2)},
            {"type": "message", "role": "timmy", "content": "second a", "timestamp": stamp(1)},
        ]
        _make_session_log(entries, date_str, tmp_path)

        result = self._exporter(tmp_path).export_week(weeks_ago=0)
        # At least one trajectory; the gap may or may not split it into two.
        assert len(result) >= 1

    def test_handles_malformed_log_file(self, tmp_path):
        log_dir = tmp_path / "logs"
        log_dir.mkdir()
        today = datetime.now(tz=UTC).strftime("%Y-%m-%d")
        (log_dir / f"session_{today}.jsonl").write_text("not json\n{}\n")

        exporter = TrajectoryExporter(logs_dir=log_dir, repo_root=tmp_path)
        # Malformed lines must be tolerated, never raised.
        assert isinstance(exporter.export_week(weeks_ago=0), list)
|
|
|
|
|
|
# ── QualityFilter tests ───────────────────────────────────────────────────────
|
|
|
|
|
|
class TestQualityFilter:
    """Tests for trajectory quality classification and the trainable filter."""

    def _make_high_quality(self) -> Trajectory:
        """Trajectory with tool calls, a decision, high confidence, no errors."""
        return Trajectory(
            session_date="2026-03-17",
            started_at=_ts(),
            ended_at=_ts(),
            messages=[_user_msg("do task"), _timmy_msg("done", confidence=0.9)],
            tool_calls=[_tool_call(), _tool_call()],
            errors=[],
            decisions=[_decision_entry()],
        )

    def _make_medium_quality(self) -> Trajectory:
        """Plain chat exchange: no tools, no errors, no decisions."""
        return Trajectory(
            session_date="2026-03-17",
            started_at=_ts(),
            ended_at=_ts(),
            messages=[_user_msg("hello"), _timmy_msg("hi")],
            tool_calls=[],
            errors=[],
        )

    def _make_low_quality(self) -> Trajectory:
        """Trajectory with an error and no user message at all."""
        return Trajectory(
            session_date="2026-03-17",
            started_at=_ts(),
            ended_at=_ts(),
            messages=[_timmy_msg("oops")],  # No user message
            errors=[_error_entry()],
        )

    def test_high_quality_classification(self):
        result = QualityFilter().assess(self._make_high_quality())
        assert result.quality == TrajectoryQuality.HIGH
        assert result.score >= 4.0
        assert result.is_trainable

    def test_medium_quality_classification(self):
        result = QualityFilter().assess(self._make_medium_quality())
        assert result.quality == TrajectoryQuality.MEDIUM
        assert result.is_trainable

    def test_low_quality_no_user_message(self):
        timmy_only = Trajectory(
            session_date="2026-03-17",
            started_at=_ts(),
            ended_at=_ts(),
            messages=[_timmy_msg("random")],
        )
        result = QualityFilter().assess(timmy_only)
        assert result.quality == TrajectoryQuality.LOW
        assert not result.is_trainable

    def test_error_penalizes_score(self):
        qf = QualityFilter()
        erroring = Trajectory(
            session_date="2026-03-17",
            started_at=_ts(),
            ended_at=_ts(),
            messages=[_user_msg("go"), _timmy_msg("fail")],
            tool_calls=[_tool_call()],
            errors=[_error_entry(), _error_entry()],
        )
        # Two errors must score strictly below the clean high-quality case.
        assert qf.assess(erroring).score < qf.assess(self._make_high_quality()).score

    def test_low_confidence_penalizes_score(self):
        shaky = Trajectory(
            session_date="2026-03-17",
            started_at=_ts(),
            ended_at=_ts(),
            messages=[_user_msg("q"), _timmy_msg("a", confidence=0.2)],
        )
        assert QualityFilter().assess(shaky).score < 1.0

    def test_filter_returns_stats(self):
        trajectories = [
            self._make_high_quality(),
            self._make_medium_quality(),
            self._make_low_quality(),
        ]
        trainable, stats = QualityFilter().filter(trajectories)
        assert stats["total"] == 3
        assert stats["accepted"] == len(trainable)
        assert stats["high"] + stats["medium"] + stats["low"] == 3

    def test_filter_empty_list(self):
        trainable, stats = QualityFilter().filter([])
        assert trainable == []
        assert stats["total"] == 0
        assert stats["accepted"] == 0
|
|
|
|
|
|
# ── TrainingDataset tests ─────────────────────────────────────────────────────
|
|
|
|
|
|
class TestTrainingDataset:
    """Tests for the deduplicating JSONL training dataset and its index."""

    def _make_result(self, quality=TrajectoryQuality.HIGH, score=5.0):
        """Return a QualityResult wrapping a small two-message trajectory.

        Imported locally to keep QualityResult off the module import list.
        """
        from timmy_automations.retrain.quality_filter import QualityResult

        t = Trajectory(
            session_date="2026-03-17",
            started_at=_ts(-5),
            ended_at=_ts(),
            messages=[_user_msg("do it"), _timmy_msg("done")],
            tool_calls=[_tool_call()],
        )
        return QualityResult(trajectory=t, quality=quality, score=score, reasons=[])

    def test_count_empty_dataset(self, tmp_path):
        ds = TrainingDataset(
            dataset_path=".loop/retrain/training_data.jsonl",
            repo_root=tmp_path,
        )
        assert ds.count() == 0

    def test_append_adds_examples(self, tmp_path):
        ds = TrainingDataset(repo_root=tmp_path)
        result = ds.append([self._make_result()], "2026-W12")
        assert result.new_examples == 1
        assert result.total_examples == 1
        assert ds.count() == 1

    def test_append_idempotent(self, tmp_path):
        ds = TrainingDataset(repo_root=tmp_path)
        r = self._make_result()
        ds.append([r], "2026-W12")
        result2 = ds.append([r], "2026-W12")
        # Same trajectory shouldn't be added twice
        assert result2.new_examples == 0
        assert ds.count() == 1

    def test_append_different_weeks(self, tmp_path):
        ds = TrainingDataset(repo_root=tmp_path)
        r1 = self._make_result()
        ds.append([r1], "2026-W11")
        ds.append([r1], "2026-W12")
        # Different week tags = different records
        assert ds.count() == 2

    def test_dataset_file_is_valid_jsonl(self, tmp_path):
        ds = TrainingDataset(repo_root=tmp_path)
        ds.append([self._make_result()], "2026-W12")
        # Explicit UTF-8 read keeps the check platform-independent; `line`
        # replaces the ambiguous single-letter `l` loop variable (E741).
        with open(ds.dataset_path, encoding="utf-8") as f:
            lines = [line.strip() for line in f if line.strip()]
        assert len(lines) == 1
        record = json.loads(lines[0])
        assert "messages" in record
        assert "week" in record
        assert "quality" in record

    def test_index_updated_after_append(self, tmp_path):
        ds = TrainingDataset(repo_root=tmp_path)
        ds.append([self._make_result()], "2026-W12")
        index_path = tmp_path / ".loop" / "retrain" / "dataset_index.json"
        assert index_path.exists()
        index = json.loads(index_path.read_text())
        assert index["total_examples"] == 1
        assert "2026-W12" in index["weeks"]
|
|
|
|
|
|
# ── TrainingLog tests ─────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestTrainingLog:
    """Tests for cycle-metric recording, iteration numbering, and the markdown summary."""

    def _make_metrics(self, iteration: int = 1) -> CycleMetrics:
        """Fully-populated metrics for a completed fine-tune cycle."""
        return CycleMetrics(
            iteration=iteration,
            week="2026-W12",
            ran_at=datetime.now(tz=UTC).isoformat(),
            trajectories_total=10,
            trajectories_high=5,
            trajectories_medium=3,
            trajectories_low=2,
            trajectories_accepted=8,
            examples_added=5,
            dataset_total=5,
            train_status="completed",
            train_loss=1.2345,
            train_duration_seconds=120.5,
            adapter_path=".loop/retrain/adapters/iter_0001/adapters.npz",
            model_name="hermes4-14b-ft-0001",
            notes="First fine-tune cycle complete",
        )

    def test_next_iteration_starts_at_1(self, tmp_path):
        assert TrainingLog(repo_root=tmp_path).next_iteration() == 1

    def test_next_iteration_increments(self, tmp_path):
        log = TrainingLog(repo_root=tmp_path)
        log.record(self._make_metrics(iteration=1))
        assert log.next_iteration() == 2

    def test_record_creates_log_file(self, tmp_path):
        log = TrainingLog(repo_root=tmp_path)
        log.record(self._make_metrics())
        assert log.log_path.exists()

    def test_load_all_returns_records(self, tmp_path):
        log = TrainingLog(repo_root=tmp_path)
        for i in (1, 2):
            log.record(self._make_metrics(iteration=i))
        entries = log.load_all()
        assert len(entries) == 2
        assert entries[0]["iteration"] == 1

    def test_latest_returns_last_entry(self, tmp_path):
        log = TrainingLog(repo_root=tmp_path)
        for i in (1, 2):
            log.record(self._make_metrics(iteration=i))
        latest = log.latest()
        assert latest is not None
        assert latest["iteration"] == 2

    def test_latest_returns_none_when_empty(self, tmp_path):
        assert TrainingLog(repo_root=tmp_path).latest() is None

    def test_summary_markdown_written(self, tmp_path):
        log = TrainingLog(repo_root=tmp_path)
        log.record(self._make_metrics())
        summary_path = tmp_path / ".loop" / "retrain" / "training_log.md"
        assert summary_path.exists()
        content = summary_path.read_text()
        for expected in ("AutoLoRA Training Log", "2026-W12", "completed"):
            assert expected in content

    def test_skill_accuracy_in_summary(self, tmp_path):
        log = TrainingLog(repo_root=tmp_path)
        metrics = self._make_metrics()
        metrics.skill_accuracy = {"tool_calling": 0.85, "reasoning": 0.72}
        log.record(metrics)
        content = (tmp_path / ".loop" / "retrain" / "training_log.md").read_text()
        assert "tool_calling" in content
        assert "reasoning" in content
|
|
|
|
|
|
# ── RetrainOrchestrator integration tests ─────────────────────────────────────
|
|
|
|
|
|
class TestRetrainOrchestrator:
    """End-to-end dry-run tests plus automation-manifest checks."""

    @staticmethod
    def _dry_orchestrator(tmp_path) -> RetrainOrchestrator:
        """Orchestrator in dry-run mode with a guaranteed ``logs`` directory."""
        (tmp_path / "logs").mkdir(parents=True, exist_ok=True)
        return RetrainOrchestrator(repo_root=tmp_path, dry_run=True)

    def test_run_dry_run_no_data(self, tmp_path):
        """Dry run with no session logs should complete without errors."""
        result = self._dry_orchestrator(tmp_path).run(weeks_ago=0)
        assert result.train_status == "skipped"
        assert result.examples_added == 0
        assert result.iteration == 1

    def test_run_creates_log_entry(self, tmp_path):
        self._dry_orchestrator(tmp_path).run(weeks_ago=0)
        assert len(TrainingLog(repo_root=tmp_path).load_all()) == 1

    def test_run_with_session_data(self, tmp_path):
        """Run with actual session data — should export, filter, and log."""
        date_str = datetime.now(tz=UTC).strftime("%Y-%m-%d")
        entries = [
            _user_msg("deploy the service", offset=-10),
            _tool_call("bash", "deployed successfully", offset=-9),
            _tool_call("bash", "health check ok", offset=-8),
            _timmy_msg("Service deployed and healthy", confidence=0.92, offset=-7),
            _user_msg("run the tests", offset=-6),
            _tool_call("bash", "All tests passed", offset=-5),
            _timmy_msg("All 42 tests passed", confidence=0.95, offset=-4),
        ]
        _make_session_log(entries, date_str, tmp_path)

        result = self._dry_orchestrator(tmp_path).run(weeks_ago=0)

        assert result.trajectories_exported >= 1
        assert result.iteration == 1
        # In dry_run mode, fine-tune is skipped but trajectories should be processed
        assert result.train_status == "skipped"

    def test_iteration_increments_on_second_run(self, tmp_path):
        orc = self._dry_orchestrator(tmp_path)
        first = orc.run(weeks_ago=0)
        second = orc.run(weeks_ago=0)
        assert second.iteration == first.iteration + 1

    def test_automations_json_has_retrain_entry(self):
        """Verify the retrain automation is registered in automations.json."""
        config_path = _REPO_ROOT / "timmy_automations" / "config" / "automations.json"
        assert config_path.exists()
        manifest = json.loads(config_path.read_text())
        assert "retrain" in [a["id"] for a in manifest.get("automations", [])]

    def test_retrain_automation_config(self):
        """Verify retrain automation has correct schedule and config."""
        config_path = _REPO_ROOT / "timmy_automations" / "config" / "automations.json"
        manifest = json.loads(config_path.read_text())
        retrain = next(a for a in manifest["automations"] if a["id"] == "retrain")
        assert retrain["trigger"] == "scheduled"
        assert retrain["schedule"] == "weekly_sunday"
        assert retrain["config"]["base_model"] == "hermes4-14b"
        assert retrain["config"]["weeks_ago"] == 1
|
|
|
|
|
|
# Repository root, derived from this test file's location (three parents up).
# NOTE(review): defined at the bottom of the module although it is referenced
# inside TestRetrainOrchestrator above — safe because module-level names are
# resolved at call time, but consider moving it next to the imports.
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
|