Timmy-time-dashboard/tests/timmy/test_autoresearch_perplexity.py

"""Tests for using the autoresearch module with perplexity as the target metric.

Covers run integration, config override, history, and dashboard rendering when
`autoresearch_metric` is set to ``perplexity``.

Note: metric extraction and evaluation logic are already tested in
test_autoresearch.py — only perplexity-specific integration paths are tested here.
"""

from unittest.mock import MagicMock, patch

import pytest

# ── run_experiment with perplexity ──────────────────────────────────────


class TestRunExperimentPerplexity:
    """run_experiment must extract perplexity when metric_name='perplexity'."""

    def test_extracts_perplexity_from_stdout(self, tmp_path):
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('training')")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="step 100 perplexity: 85.4\nstep 200 perplexity: 61.2",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert result["success"] is True
        assert result["metric"] == pytest.approx(61.2)
        assert result["error"] is None

    def test_returns_none_when_no_perplexity_logged(self, tmp_path):
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('done')")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="training complete, no metrics",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert result["success"] is True
        assert result["metric"] is None

    def test_extracts_from_stderr_too(self, tmp_path):
        """Some frameworks log metrics to stderr."""
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("import sys")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="",
                stderr="perplexity: 72.3",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert result["metric"] == pytest.approx(72.3)


# ── 4. Config override ─────────────────────────────────────────────────


class TestPerplexityConfig:
    """Settings plumbing: AUTORESEARCH_METRIC=perplexity reaches every call."""

    def test_config_accepts_perplexity(self):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        original = settings.autoresearch_metric
        try:
            settings.autoresearch_metric = "perplexity"
            assert settings.autoresearch_metric == "perplexity"
        finally:
            settings.autoresearch_metric = original

    def test_run_experiment_receives_custom_metric(self, tmp_path):
        """Verify run_experiment correctly uses 'perplexity' metric_name."""
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('hi')")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="perplexity: 44.2\nval_bpb: 1.05",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # Must pick perplexity, not val_bpb
        assert result["metric"] == pytest.approx(44.2)


# ── 5. History with perplexity values ───────────────────────────────────


class TestPerplexityHistory:
    """Experiment history stores and retrieves perplexity values correctly."""

    def test_round_trips_through_jsonl(self, tmp_path):
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 85.4, "success": True, "metric_name": "perplexity"})
        _append_result(tmp_path, {"metric": 61.2, "success": True, "metric_name": "perplexity"})

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        assert history[0]["metric"] == 61.2  # most recent first
        assert history[1]["metric"] == 85.4

    def test_mixed_metric_history(self, tmp_path):
        """History may contain runs with different metrics (migration case)."""
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 1.12, "metric_name": "val_bpb"})
        _append_result(tmp_path, {"metric": 55.0, "metric_name": "perplexity"})

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        assert history[0]["metric_name"] == "perplexity"


# ── 6. Dashboard route with perplexity ──────────────────────────────────


class TestExperimentsRoutePerplexity:
    """The /experiments dashboard renders correctly with perplexity metric."""

    def test_displays_perplexity_metric_name(self, client):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        original_metric = settings.autoresearch_metric
        original_enabled = settings.autoresearch_enabled
        try:
            settings.autoresearch_metric = "perplexity"
            settings.autoresearch_enabled = True

            with patch("timmy.autoresearch.get_experiment_history", return_value=[]):
                resp = client.get("/experiments")

            assert resp.status_code == 200
        finally:
            settings.autoresearch_metric = original_metric
            settings.autoresearch_enabled = original_enabled