From 21e2ae427a5a36ac479673abce6c540517da5e8a Mon Sep 17 00:00:00 2001
From: Alexander Whitestone <8633216+AlexanderWhitestone@users.noreply.github.com>
Date: Mon, 9 Mar 2026 09:36:26 -0400
Subject: [PATCH] Add test plan for autoresearch with perplexity metric (#154)

---
 tests/timmy/test_autoresearch_perplexity.py | 250 ++++++++++++++++++++
 1 file changed, 250 insertions(+)
 create mode 100644 tests/timmy/test_autoresearch_perplexity.py

diff --git a/tests/timmy/test_autoresearch_perplexity.py b/tests/timmy/test_autoresearch_perplexity.py
new file mode 100644
index 00000000..31091c55
--- /dev/null
+++ b/tests/timmy/test_autoresearch_perplexity.py
@@ -0,0 +1,250 @@
"""Test plan for using the autoresearch module with perplexity as the target metric.

Perplexity is a standard LM evaluation metric (lower = better), so the existing
evaluate_result direction logic (lower-is-better) is correct without changes.

The tests below verify that every layer of the stack (metric extraction,
evaluation, run integration, config override, history storage, and dashboard
rendering) works correctly when ``autoresearch_metric`` is set to
``perplexity``.
"""

from unittest.mock import MagicMock, patch

import pytest


# ── 1. Metric extraction ────────────────────────────────────────────────


class TestExtractPerplexity:
    """_extract_metric must find 'perplexity' values in training output."""

    def test_extracts_perplexity_value(self):
        from timmy.autoresearch import _extract_metric

        output = "step 500 perplexity: 42.31\nstep 1000 perplexity: 38.05"
        assert _extract_metric(output, "perplexity") == pytest.approx(38.05)

    def test_extracts_last_occurrence(self):
        from timmy.autoresearch import _extract_metric

        output = (
            "perplexity: 100.0\n"
            "perplexity: 80.5\n"
            "perplexity: 55.2\n"
        )
        assert _extract_metric(output, "perplexity") == pytest.approx(55.2)

    def test_handles_integer_perplexity(self):
        from timmy.autoresearch import _extract_metric

        output = "perplexity: 42"
        assert _extract_metric(output, "perplexity") == pytest.approx(42.0)

    def test_handles_space_separator(self):
        """Some training scripts use 'perplexity 38.5' without a colon."""
        from timmy.autoresearch import _extract_metric

        output = "perplexity 38.5"
        assert _extract_metric(output, "perplexity") == pytest.approx(38.5)

    def test_returns_none_when_absent(self):
        from timmy.autoresearch import _extract_metric

        assert _extract_metric("loss: 0.45", "perplexity") is None

    def test_ignores_unrelated_numbers(self):
        from timmy.autoresearch import _extract_metric

        output = "step 500 lr 0.001 loss 2.3\nperplexity: 50.1"
        assert _extract_metric(output, "perplexity") == pytest.approx(50.1)
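

# For reference while reviewing: a minimal sketch of the extraction contract
# the tests above pin down (last occurrence wins, colon or bare-space
# separator, integers and floats both parse, None when the metric is absent).
# This is an illustration only, not the implementation under test; the real
# _extract_metric lives in timmy.autoresearch and may differ.
import re


def _reference_extract_metric(output: str, name: str) -> float | None:
    """Sketch of _extract_metric's contract, inferred from the tests above."""
    # Accept "perplexity: 38.5" as well as "perplexity 38.5".
    matches = re.findall(rf"{re.escape(name)}\s*:?\s*(-?\d+(?:\.\d+)?)", output)
    # Training logs repeat the metric each step; the last value is current.
    return float(matches[-1]) if matches else None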


# ── 2. Evaluation with perplexity ───────────────────────────────────────


class TestEvaluatePerplexity:
    """evaluate_result should treat lower perplexity as an improvement."""

    def test_lower_is_improvement(self):
        from timmy.autoresearch import evaluate_result

        result = evaluate_result(35.0, 42.0, metric_name="perplexity")
        assert "improvement" in result.lower()
        assert "perplexity" in result.lower()

    def test_higher_is_regression(self):
        from timmy.autoresearch import evaluate_result

        result = evaluate_result(50.0, 42.0, metric_name="perplexity")
        assert "regression" in result.lower()

    def test_equal_is_no_change(self):
        from timmy.autoresearch import evaluate_result

        result = evaluate_result(42.0, 42.0, metric_name="perplexity")
        assert "no change" in result.lower()

    def test_percentage_is_correct(self):
        from timmy.autoresearch import evaluate_result

        # 40 -> 30 is a -25% change
        result = evaluate_result(30.0, 40.0, metric_name="perplexity")
        assert "-25.00%" in result


# ── 3. run_experiment with perplexity ───────────────────────────────────


class TestRunExperimentPerplexity:
    """run_experiment must extract perplexity when metric_name='perplexity'."""

    def test_extracts_perplexity_from_stdout(self, tmp_path):
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('training')")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="step 100 perplexity: 85.4\nstep 200 perplexity: 61.2",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert result["success"] is True
        assert result["metric"] == pytest.approx(61.2)
        assert result["error"] is None

    def test_returns_none_when_no_perplexity_logged(self, tmp_path):
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('done')")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="training complete, no metrics",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert result["success"] is True
        assert result["metric"] is None

    def test_extracts_from_stderr_too(self, tmp_path):
        """Some frameworks log metrics to stderr."""
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("import sys")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="",
                stderr="perplexity: 72.3",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert result["metric"] == pytest.approx(72.3)
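

# For reference: the lower-is-better direction logic the evaluation tests
# above assume, in miniature. Lower perplexity is an improvement, higher is
# a regression, and the percentage is computed against the baseline. The
# exact message format of the real evaluate_result in timmy.autoresearch is
# an assumption; only the substrings the tests assert on are modeled here.
def _reference_evaluate_result(new: float, baseline: float, metric_name: str) -> str:
    """Sketch of evaluate_result's contract, inferred from the tests above."""
    if new == baseline:
        return f"no change in {metric_name}"
    pct = (new - baseline) / baseline * 100  # e.g. 40.0 -> 30.0 gives -25.00%
    direction = "improvement" if new < baseline else "regression"
    return f"{metric_name}: {pct:+.2f}% ({direction})"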


# ── 4. Config override ──────────────────────────────────────────────────


class TestPerplexityConfig:
    """Settings plumbing: AUTORESEARCH_METRIC=perplexity reaches every call."""

    def test_config_accepts_perplexity(self):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        original = settings.autoresearch_metric
        try:
            settings.autoresearch_metric = "perplexity"
            assert settings.autoresearch_metric == "perplexity"
        finally:
            settings.autoresearch_metric = original

    def test_run_experiment_receives_custom_metric(self, tmp_path):
        """Verify run_experiment correctly uses 'perplexity' metric_name."""
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('hi')")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="perplexity: 44.2\nval_bpb: 1.05",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # Must pick perplexity, not val_bpb
        assert result["metric"] == pytest.approx(44.2)


# ── 5. History with perplexity values ───────────────────────────────────


class TestPerplexityHistory:
    """Experiment history stores and retrieves perplexity values correctly."""

    def test_round_trips_through_jsonl(self, tmp_path):
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 85.4, "success": True, "metric_name": "perplexity"})
        _append_result(tmp_path, {"metric": 61.2, "success": True, "metric_name": "perplexity"})

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        assert history[0]["metric"] == 61.2  # most recent first
        assert history[1]["metric"] == 85.4

    def test_mixed_metric_history(self, tmp_path):
        """History may contain runs with different metrics (migration case)."""
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 1.12, "metric_name": "val_bpb"})
        _append_result(tmp_path, {"metric": 55.0, "metric_name": "perplexity"})

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        assert history[0]["metric_name"] == "perplexity"


# ── 6. Dashboard route with perplexity ──────────────────────────────────


class TestExperimentsRoutePerplexity:
    """The /experiments dashboard renders correctly with perplexity metric."""

    def test_displays_perplexity_metric_name(self, client):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        original_metric = settings.autoresearch_metric
        original_enabled = settings.autoresearch_enabled
        try:
            settings.autoresearch_metric = "perplexity"
            settings.autoresearch_enabled = True

            with patch("dashboard.routes.experiments.get_experiment_history", return_value=[]):
                resp = client.get("/experiments")

            assert resp.status_code == 200
        finally:
            # Restore both overridden settings so later tests see the original config.
            settings.autoresearch_metric = original_metric
            settings.autoresearch_enabled = original_enabled
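

# For reference: a minimal sketch of the JSONL round-trip that
# TestPerplexityHistory exercises above. The history filename
# ("experiments.jsonl") and record schema are assumptions for illustration;
# the real _append_result / get_experiment_history in timmy.autoresearch
# own the actual path and format.
import json
from pathlib import Path


def _reference_append_result(workdir: Path, record: dict) -> None:
    # One JSON object per line; appending avoids rewriting the whole file.
    with (workdir / "experiments.jsonl").open("a") as fh:
        fh.write(json.dumps(record) + "\n")


def _reference_history(workdir: Path) -> list[dict]:
    path = workdir / "experiments.jsonl"
    if not path.exists():
        return []
    # Most recent run first, matching the ordering the tests assert on.
    return [json.loads(line) for line in reversed(path.read_text().splitlines())]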