"""Tests for using the autoresearch module with perplexity as the target metric. Covers run integration, config override, history, and dashboard rendering when `autoresearch_metric` is set to ``perplexity``. Note: metric extraction and evaluation logic are already tested in test_autoresearch.py — only perplexity-specific integration paths are tested here. """ from unittest.mock import MagicMock, patch import pytest # ── run_experiment with perplexity ────────────────────────────────────── class TestRunExperimentPerplexity: """run_experiment must extract perplexity when metric_name='perplexity'.""" def test_extracts_perplexity_from_stdout(self, tmp_path): from timmy.autoresearch import run_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() (repo_dir / "train.py").write_text("print('training')") with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.return_value = MagicMock( returncode=0, stdout="step 100 perplexity: 85.4\nstep 200 perplexity: 61.2", stderr="", ) result = run_experiment(tmp_path, timeout=60, metric_name="perplexity") assert result["success"] is True assert result["metric"] == pytest.approx(61.2) assert result["error"] is None def test_returns_none_when_no_perplexity_logged(self, tmp_path): from timmy.autoresearch import run_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() (repo_dir / "train.py").write_text("print('done')") with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.return_value = MagicMock( returncode=0, stdout="training complete, no metrics", stderr="", ) result = run_experiment(tmp_path, timeout=60, metric_name="perplexity") assert result["success"] is True assert result["metric"] is None def test_extracts_from_stderr_too(self, tmp_path): """Some frameworks log metrics to stderr.""" from timmy.autoresearch import run_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() (repo_dir / "train.py").write_text("import sys") with patch("timmy.autoresearch.subprocess.run") 
as mock_run: mock_run.return_value = MagicMock( returncode=0, stdout="", stderr="perplexity: 72.3", ) result = run_experiment(tmp_path, timeout=60, metric_name="perplexity") assert result["metric"] == pytest.approx(72.3) # ── 4. Config override ───────────────────────────────────────────────── class TestPerplexityConfig: """Settings plumbing: AUTORESEARCH_METRIC=perplexity reaches every call.""" def test_config_accepts_perplexity(self): try: from config import settings except ImportError: pytest.skip("pydantic_settings not installed") original = settings.autoresearch_metric try: settings.autoresearch_metric = "perplexity" assert settings.autoresearch_metric == "perplexity" finally: settings.autoresearch_metric = original def test_run_experiment_receives_custom_metric(self, tmp_path): """Verify run_experiment correctly uses 'perplexity' metric_name.""" from timmy.autoresearch import run_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() (repo_dir / "train.py").write_text("print('hi')") with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.return_value = MagicMock( returncode=0, stdout="perplexity: 44.2\nval_bpb: 1.05", stderr="", ) result = run_experiment(tmp_path, timeout=60, metric_name="perplexity") # Must pick perplexity, not val_bpb assert result["metric"] == pytest.approx(44.2) # ── 5. 
# ── History with perplexity values ──────────────────────────────────────
class TestPerplexityHistory:
    """Experiment history stores and retrieves perplexity values correctly."""

    def test_round_trips_through_jsonl(self, tmp_path):
        from timmy.autoresearch import _append_result, get_experiment_history

        # Append two runs in chronological order.
        for value in (85.4, 61.2):
            _append_result(
                tmp_path,
                {"metric": value, "success": True, "metric_name": "perplexity"},
            )

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        # Newest entry is returned first.
        assert history[0]["metric"] == 61.2
        assert history[1]["metric"] == 85.4

    def test_mixed_metric_history(self, tmp_path):
        """History may contain runs with different metrics (migration case)."""
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 1.12, "metric_name": "val_bpb"})
        _append_result(tmp_path, {"metric": 55.0, "metric_name": "perplexity"})

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        # The perplexity run was appended last, so it comes back first.
        assert history[0]["metric_name"] == "perplexity"


# ── Dashboard route with perplexity ─────────────────────────────────────
class TestExperimentsRoutePerplexity:
    """The /experiments dashboard renders correctly with perplexity metric."""

    def test_displays_perplexity_metric_name(self, client):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        saved = (settings.autoresearch_metric, settings.autoresearch_enabled)
        try:
            settings.autoresearch_metric = "perplexity"
            settings.autoresearch_enabled = True
            with patch(
                "timmy.autoresearch.get_experiment_history", return_value=[]
            ):
                response = client.get("/experiments")
            assert response.status_code == 200
        finally:
            # Restore both globals regardless of assertion outcome.
            settings.autoresearch_metric, settings.autoresearch_enabled = saved