"""Tests for using the autoresearch module with perplexity as the target metric.

Covers run integration, config override, history, and dashboard rendering when
`autoresearch_metric` is set to ``perplexity``.

Note: metric extraction and evaluation logic are already tested in
test_autoresearch.py — only perplexity-specific integration paths are tested here.
"""
|
|
|
|
|
|
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
# ── run_experiment with perplexity ──────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestRunExperimentPerplexity:
    """run_experiment must extract perplexity when metric_name='perplexity'."""

    @staticmethod
    def _fake_repo(root, script_body):
        # Lay down the minimal repo layout run_experiment expects under *root*.
        repo = root / "autoresearch"
        repo.mkdir()
        (repo / "train.py").write_text(script_body)

    @staticmethod
    def _fake_proc(out, err):
        # Stand-in for a successful subprocess.CompletedProcess.
        return MagicMock(returncode=0, stdout=out, stderr=err)

    def test_extracts_perplexity_from_stdout(self, tmp_path):
        from timmy.autoresearch import run_experiment

        self._fake_repo(tmp_path, "print('training')")

        with patch("timmy.autoresearch.subprocess.run") as run_proc:
            run_proc.return_value = self._fake_proc(
                "step 100 perplexity: 85.4\nstep 200 perplexity: 61.2", ""
            )
            outcome = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # The most recent logged value wins, and the run is reported clean.
        assert outcome["success"] is True
        assert outcome["metric"] == pytest.approx(61.2)
        assert outcome["error"] is None

    def test_returns_none_when_no_perplexity_logged(self, tmp_path):
        from timmy.autoresearch import run_experiment

        self._fake_repo(tmp_path, "print('done')")

        with patch("timmy.autoresearch.subprocess.run") as run_proc:
            run_proc.return_value = self._fake_proc("training complete, no metrics", "")
            outcome = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # A clean run that never logs the metric succeeds with no value.
        assert outcome["success"] is True
        assert outcome["metric"] is None

    def test_extracts_from_stderr_too(self, tmp_path):
        """Some frameworks log metrics to stderr."""
        from timmy.autoresearch import run_experiment

        self._fake_repo(tmp_path, "import sys")

        with patch("timmy.autoresearch.subprocess.run") as run_proc:
            run_proc.return_value = self._fake_proc("", "perplexity: 72.3")
            outcome = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert outcome["metric"] == pytest.approx(72.3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Config override ────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestPerplexityConfig:
    """Settings plumbing: AUTORESEARCH_METRIC=perplexity reaches every call."""

    def test_config_accepts_perplexity(self):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        saved = settings.autoresearch_metric
        try:
            # The setting must round-trip the new value unchanged.
            settings.autoresearch_metric = "perplexity"
            assert settings.autoresearch_metric == "perplexity"
        finally:
            settings.autoresearch_metric = saved

    def test_run_experiment_receives_custom_metric(self, tmp_path):
        """Verify run_experiment correctly uses 'perplexity' metric_name."""
        from timmy.autoresearch import run_experiment

        repo = tmp_path / "autoresearch"
        repo.mkdir()
        (repo / "train.py").write_text("print('hi')")

        with patch("timmy.autoresearch.subprocess.run") as run_proc:
            run_proc.return_value = MagicMock(
                returncode=0,
                stdout="perplexity: 44.2\nval_bpb: 1.05",
                stderr="",
            )
            outcome = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # Must pick perplexity, not val_bpb
        assert outcome["metric"] == pytest.approx(44.2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── History with perplexity values ─────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestPerplexityHistory:
    """Experiment history stores and retrieves perplexity values correctly."""

    def test_round_trips_through_jsonl(self, tmp_path):
        from timmy.autoresearch import _append_result, get_experiment_history

        for value in (85.4, 61.2):
            _append_result(
                tmp_path,
                {"metric": value, "success": True, "metric_name": "perplexity"},
            )

        records = get_experiment_history(tmp_path)
        assert len(records) == 2
        assert records[0]["metric"] == 61.2  # most recent first
        assert records[1]["metric"] == 85.4

    def test_mixed_metric_history(self, tmp_path):
        """History may contain runs with different metrics (migration case)."""
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 1.12, "metric_name": "val_bpb"})
        _append_result(tmp_path, {"metric": 55.0, "metric_name": "perplexity"})

        records = get_experiment_history(tmp_path)
        assert len(records) == 2
        # Newest entry (perplexity) surfaces first even among mixed metrics.
        assert records[0]["metric_name"] == "perplexity"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Dashboard route with perplexity ────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestExperimentsRoutePerplexity:
    """The /experiments dashboard renders correctly with perplexity metric."""

    def test_displays_perplexity_metric_name(self, client):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        # Snapshot both settings so the test leaves global state untouched.
        saved_metric = settings.autoresearch_metric
        saved_enabled = settings.autoresearch_enabled
        try:
            settings.autoresearch_metric = "perplexity"
            settings.autoresearch_enabled = True

            # Empty history keeps the route focused on rendering, not data.
            with patch("timmy.autoresearch.get_experiment_history", return_value=[]):
                resp = client.get("/experiments")

            assert resp.status_code == 200
        finally:
            settings.autoresearch_metric = saved_metric
            settings.autoresearch_enabled = saved_enabled
|