From 21e2ae427a5a36ac479673abce6c540517da5e8a Mon Sep 17 00:00:00 2001
From: Alexander Whitestone <8633216+AlexanderWhitestone@users.noreply.github.com>
Date: Mon, 9 Mar 2026 09:36:26 -0400
Subject: [PATCH] Add test plan for autoresearch with perplexity metric (#154)

---
 tests/timmy/test_autoresearch_perplexity.py | 250 ++++++++++++++++++++
 1 file changed, 250 insertions(+)
 create mode 100644 tests/timmy/test_autoresearch_perplexity.py

diff --git a/tests/timmy/test_autoresearch_perplexity.py b/tests/timmy/test_autoresearch_perplexity.py
new file mode 100644
index 00000000..31091c55
--- /dev/null
+++ b/tests/timmy/test_autoresearch_perplexity.py
@@ -0,0 +1,250 @@
"""Test plan for using the autoresearch module with perplexity as the target metric.

Perplexity is a standard LM evaluation metric (lower = better), so the existing
evaluate_result direction logic (lower-is-better) is correct without changes.

The tests below verify that every layer of the stack (metric extraction,
evaluation, run integration, config override, history storage, and dashboard
rendering) works correctly when ``autoresearch_metric`` is set to
``perplexity``.
"""

from unittest.mock import MagicMock, patch

import pytest


# ── 1. Metric extraction ────────────────────────────────────────────────


class TestExtractPerplexity:
    """_extract_metric must find 'perplexity' values in training output."""

    def test_extracts_perplexity_value(self):
        from timmy.autoresearch import _extract_metric

        output = "step 500 perplexity: 42.31\nstep 1000 perplexity: 38.05"
        assert _extract_metric(output, "perplexity") == pytest.approx(38.05)

    def test_extracts_last_occurrence(self):
        from timmy.autoresearch import _extract_metric

        output = (
            "perplexity: 100.0\n"
            "perplexity: 80.5\n"
            "perplexity: 55.2\n"
        )
        assert _extract_metric(output, "perplexity") == pytest.approx(55.2)

    def test_handles_integer_perplexity(self):
        from timmy.autoresearch import _extract_metric

        output = "perplexity: 42"
        assert _extract_metric(output, "perplexity") == pytest.approx(42.0)

    def test_handles_space_separator(self):
        """Some training scripts use 'perplexity 38.5' without a colon."""
        from timmy.autoresearch import _extract_metric

        output = "perplexity 38.5"
        assert _extract_metric(output, "perplexity") == pytest.approx(38.5)

    def test_returns_none_when_absent(self):
        from timmy.autoresearch import _extract_metric

        assert _extract_metric("loss: 0.45", "perplexity") is None

    def test_ignores_unrelated_numbers(self):
        from timmy.autoresearch import _extract_metric

        output = "step 500 lr 0.001 loss 2.3\nperplexity: 50.1"
        assert _extract_metric(output, "perplexity") == pytest.approx(50.1)
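

# For reference while reviewing: a minimal sketch of the extraction contract
# the tests above pin down (last occurrence wins, colon or bare-space
# separator, integers and floats both parse, None when the metric is absent).
# This is an illustration only, not the implementation under test; the real
# _extract_metric lives in timmy.autoresearch and may differ.
import re


def _reference_extract_metric(output: str, name: str) -> float | None:
    """Sketch of _extract_metric's contract, inferred from the tests above."""
    # Accept "perplexity: 38.5" as well as "perplexity 38.5".
    matches = re.findall(rf"{re.escape(name)}\s*:?\s*(-?\d+(?:\.\d+)?)", output)
    # Training logs repeat the metric each step; the last value is current.
    return float(matches[-1]) if matches else None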


# ── 2. Evaluation with perplexity ───────────────────────────────────────


class TestEvaluatePerplexity:
    """evaluate_result should treat lower perplexity as an improvement."""

    def test_lower_is_improvement(self):
        from timmy.autoresearch import evaluate_result

        result = evaluate_result(35.0, 42.0, metric_name="perplexity")
        assert "improvement" in result.lower()
        assert "perplexity" in result.lower()

    def test_higher_is_regression(self):
        from timmy.autoresearch import evaluate_result

        result = evaluate_result(50.0, 42.0, metric_name="perplexity")
        assert "regression" in result.lower()

    def test_equal_is_no_change(self):
        from timmy.autoresearch import evaluate_result

        result = evaluate_result(42.0, 42.0, metric_name="perplexity")
        assert "no change" in result.lower()

    def test_percentage_is_correct(self):
        from timmy.autoresearch import evaluate_result

        # 40 -> 30 is a -25% change
        result = evaluate_result(30.0, 40.0, metric_name="perplexity")
        assert "-25.00%" in result


# ── 3. run_experiment with perplexity ───────────────────────────────────


class TestRunExperimentPerplexity:
    """run_experiment must extract perplexity when metric_name='perplexity'."""

    def test_extracts_perplexity_from_stdout(self, tmp_path):
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('training')")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="step 100 perplexity: 85.4\nstep 200 perplexity: 61.2",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert result["success"] is True
        assert result["metric"] == pytest.approx(61.2)
        assert result["error"] is None

    def test_returns_none_when_no_perplexity_logged(self, tmp_path):
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('done')")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="training complete, no metrics",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert result["success"] is True
        assert result["metric"] is None

    def test_extracts_from_stderr_too(self, tmp_path):
        """Some frameworks log metrics to stderr."""
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("import sys")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="",
                stderr="perplexity: 72.3",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert result["metric"] == pytest.approx(72.3)
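

# For reference: the lower-is-better direction logic the evaluation tests
# above assume, in miniature. Lower perplexity is an improvement, higher is
# a regression, and the percentage is computed against the baseline. The
# exact message format of the real evaluate_result in timmy.autoresearch is
# an assumption; only the substrings the tests assert on are modeled here.
def _reference_evaluate_result(new: float, baseline: float, metric_name: str) -> str:
    """Sketch of evaluate_result's contract, inferred from the tests above."""
    if new == baseline:
        return f"no change in {metric_name}"
    pct = (new - baseline) / baseline * 100  # e.g. 40.0 -> 30.0 gives -25.00%
    direction = "improvement" if new < baseline else "regression"
    return f"{metric_name}: {pct:+.2f}% ({direction})"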


# ── 4. Config override ──────────────────────────────────────────────────


class TestPerplexityConfig:
    """Settings plumbing: AUTORESEARCH_METRIC=perplexity reaches every call."""

    def test_config_accepts_perplexity(self):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        original = settings.autoresearch_metric
        try:
            settings.autoresearch_metric = "perplexity"
            assert settings.autoresearch_metric == "perplexity"
        finally:
            settings.autoresearch_metric = original

    def test_run_experiment_receives_custom_metric(self, tmp_path):
        """Verify run_experiment correctly uses 'perplexity' metric_name."""
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('hi')")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="perplexity: 44.2\nval_bpb: 1.05",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # Must pick perplexity, not val_bpb
        assert result["metric"] == pytest.approx(44.2)


# ── 5. History with perplexity values ───────────────────────────────────


class TestPerplexityHistory:
    """Experiment history stores and retrieves perplexity values correctly."""

    def test_round_trips_through_jsonl(self, tmp_path):
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 85.4, "success": True, "metric_name": "perplexity"})
        _append_result(tmp_path, {"metric": 61.2, "success": True, "metric_name": "perplexity"})

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        assert history[0]["metric"] == 61.2  # most recent first
        assert history[1]["metric"] == 85.4

    def test_mixed_metric_history(self, tmp_path):
        """History may contain runs with different metrics (migration case)."""
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 1.12, "metric_name": "val_bpb"})
        _append_result(tmp_path, {"metric": 55.0, "metric_name": "perplexity"})

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        assert history[0]["metric_name"] == "perplexity"


# ── 6. Dashboard route with perplexity ──────────────────────────────────


class TestExperimentsRoutePerplexity:
    """The /experiments dashboard renders correctly with perplexity metric."""

    def test_displays_perplexity_metric_name(self, client):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        original_metric = settings.autoresearch_metric
        original_enabled = settings.autoresearch_enabled
        try:
            settings.autoresearch_metric = "perplexity"
            settings.autoresearch_enabled = True

            with patch("dashboard.routes.experiments.get_experiment_history", return_value=[]):
                resp = client.get("/experiments")

            assert resp.status_code == 200
        finally:
            # Restore both overridden settings so later tests see the original config.
            settings.autoresearch_metric = original_metric
            settings.autoresearch_enabled = original_enabled
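

# For reference: a minimal sketch of the JSONL round-trip that
# TestPerplexityHistory exercises above. The history filename
# ("experiments.jsonl") and record schema are assumptions for illustration;
# the real _append_result / get_experiment_history in timmy.autoresearch
# own the actual path and format.
import json
from pathlib import Path


def _reference_append_result(workdir: Path, record: dict) -> None:
    # One JSON object per line; appending avoids rewriting the whole file.
    with (workdir / "experiments.jsonl").open("a") as fh:
        fh.write(json.dumps(record) + "\n")


def _reference_history(workdir: Path) -> list[dict]:
    path = workdir / "experiments.jsonl"
    if not path.exists():
        return []
    # Most recent run first, matching the ordering the tests assert on.
    return [json.loads(line) for line in reversed(path.read_text().splitlines())]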