"""Tests for the autoresearch module — autonomous ML experiment loops."""
import json
from unittest.mock import MagicMock, patch
import pytest


class TestAppleSiliconHelpers:
    """Unit tests covering is_apple_silicon() and _build_experiment_env()."""

    def test_is_apple_silicon_true_on_arm64_darwin(self):
        """Darwin + arm64 is recognized as Apple Silicon."""
        from timmy.autoresearch import is_apple_silicon

        # Simulate an Apple Silicon Mac: Darwin kernel on an arm64 machine.
        with patch("timmy.autoresearch.platform.system", return_value="Darwin"):
            with patch("timmy.autoresearch.platform.machine", return_value="arm64"):
                assert is_apple_silicon() is True

    def test_is_apple_silicon_false_on_linux(self):
        """An x86_64 Linux host is not detected as Apple Silicon."""
        from timmy.autoresearch import is_apple_silicon

        with patch("timmy.autoresearch.platform.system", return_value="Linux"):
            with patch("timmy.autoresearch.platform.machine", return_value="x86_64"):
                assert is_apple_silicon() is False

    def test_build_env_auto_resolves_mlx_on_apple_silicon(self):
        """backend='auto' picks MLX when running on Apple hardware."""
        from timmy.autoresearch import _build_experiment_env

        with patch("timmy.autoresearch.is_apple_silicon", return_value=True):
            resolved = _build_experiment_env(dataset="tinystories", backend="auto")

        assert resolved["AUTORESEARCH_BACKEND"] == "mlx"
        assert resolved["AUTORESEARCH_DATASET"] == "tinystories"

    def test_build_env_auto_resolves_cuda_on_non_apple(self):
        """backend='auto' falls back to CUDA on non-Apple machines."""
        from timmy.autoresearch import _build_experiment_env

        with patch("timmy.autoresearch.is_apple_silicon", return_value=False):
            resolved = _build_experiment_env(dataset="openwebtext", backend="auto")

        assert resolved["AUTORESEARCH_BACKEND"] == "cuda"
        assert resolved["AUTORESEARCH_DATASET"] == "openwebtext"

    def test_build_env_explicit_backend_not_overridden(self):
        """An explicitly requested backend survives untouched."""
        from timmy.autoresearch import _build_experiment_env

        resolved = _build_experiment_env(dataset="tinystories", backend="cpu")
        assert resolved["AUTORESEARCH_BACKEND"] == "cpu"


class TestPrepareExperiment:
    """Tests for prepare_experiment()."""

    def test_clones_repo_when_not_present(self, tmp_path):
        """A missing checkout triggers a `git clone` as the first subprocess call."""
        from timmy.autoresearch import prepare_experiment

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
            prepare_experiment(tmp_path, "https://example.com/repo.git")

        assert mock_run.call_count >= 1
        # The very first subprocess invocation must be the clone itself.
        clone_call = mock_run.call_args_list[0]
        assert "git" in clone_call.args[0]
        assert "clone" in clone_call.args[0]

    def test_skips_clone_when_present(self, tmp_path):
        """An existing checkout is reused rather than re-cloned."""
        from timmy.autoresearch import prepare_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
            prepare_experiment(tmp_path)

        # Should not call git clone
        if mock_run.called:
            assert "clone" not in str(mock_run.call_args_list[0])

    def test_clone_failure_returns_error(self, tmp_path):
        """A non-zero git exit status surfaces as a 'failed' message."""
        from timmy.autoresearch import prepare_experiment

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="auth failed")
            result = prepare_experiment(tmp_path)

        assert "failed" in result.lower()

    def test_prepare_passes_env_to_prepare_script(self, tmp_path):
        """Dataset/backend choices are forwarded via the subprocess environment."""
        from timmy.autoresearch import prepare_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "prepare.py").write_text("pass")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
            prepare_experiment(tmp_path, dataset="tinystories", backend="cpu")

        # The last call is the prepare.py invocation (clone is skipped since the
        # repo directory already exists). `call_args.kwargs` and `call_args[1]`
        # are the same mapping, so a single lookup suffices — the original
        # belt-and-braces double check was redundant.
        env = mock_run.call_args.kwargs.get("env")
        assert env is not None
        assert env["AUTORESEARCH_DATASET"] == "tinystories"
        assert env["AUTORESEARCH_BACKEND"] == "cpu"


class TestRunExperiment:
    """Tests for run_experiment()."""

    def test_successful_run_extracts_metric(self, tmp_path):
        """A clean run parses the final val_bpb from stdout."""
        from timmy.autoresearch import run_experiment

        workdir = tmp_path / "autoresearch"
        workdir.mkdir()
        (workdir / "train.py").write_text("print('training')")

        fake_proc = MagicMock(
            returncode=0,
            stdout="step 1000 val_bpb: 1.2345\nstep 2000 val_bpb: 1.1234",
            stderr="",
        )
        with patch("timmy.autoresearch.subprocess.run", return_value=fake_proc):
            outcome = run_experiment(tmp_path, timeout=60)

        # The last reported value wins.
        assert outcome["success"] is True
        assert outcome["metric"] == pytest.approx(1.1234)
        assert outcome["error"] is None

    def test_timeout_returns_error(self, tmp_path):
        """A subprocess timeout is reported as a 'timed out' failure."""
        import subprocess

        from timmy.autoresearch import run_experiment

        workdir = tmp_path / "autoresearch"
        workdir.mkdir()
        (workdir / "train.py").write_text("print('training')")

        boom = subprocess.TimeoutExpired(cmd="python", timeout=5)
        with patch("timmy.autoresearch.subprocess.run", side_effect=boom):
            outcome = run_experiment(tmp_path, timeout=5)

        assert outcome["success"] is False
        assert "timed out" in outcome["error"].lower()

    def test_missing_train_py(self, tmp_path):
        """A checkout without train.py fails with 'not found'."""
        from timmy.autoresearch import run_experiment

        # No train.py inside the checkout.
        (tmp_path / "autoresearch").mkdir()

        outcome = run_experiment(tmp_path)
        assert outcome["success"] is False
        assert "not found" in outcome["error"].lower()

    def test_no_metric_in_output(self, tmp_path):
        """A clean exit without a metric line is a success with metric=None."""
        from timmy.autoresearch import run_experiment

        workdir = tmp_path / "autoresearch"
        workdir.mkdir()
        (workdir / "train.py").write_text("print('done')")

        fake_proc = MagicMock(returncode=0, stdout="no metrics here", stderr="")
        with patch("timmy.autoresearch.subprocess.run", return_value=fake_proc):
            outcome = run_experiment(tmp_path)

        assert outcome["success"] is True
        assert outcome["metric"] is None


class TestEvaluateResult:
    """Tests for evaluate_result()."""

    def test_improvement_detected(self):
        """A lower metric than baseline reads as an improvement."""
        from timmy.autoresearch import evaluate_result

        verdict = evaluate_result(1.10, 1.20)
        assert "improvement" in verdict.lower()

    def test_regression_detected(self):
        """A higher metric than baseline reads as a regression."""
        from timmy.autoresearch import evaluate_result

        verdict = evaluate_result(1.30, 1.20)
        assert "regression" in verdict.lower()

    def test_no_change(self):
        """Equal metric and baseline reads as no change."""
        from timmy.autoresearch import evaluate_result

        verdict = evaluate_result(1.20, 1.20)
        assert "no change" in verdict.lower()


class TestExperimentHistory:
    """Tests for get_experiment_history()."""

    def test_empty_workspace(self, tmp_path):
        """A workspace without results.jsonl yields an empty history."""
        from timmy.autoresearch import get_experiment_history

        assert get_experiment_history(tmp_path) == []

    def test_reads_jsonl(self, tmp_path):
        """History is parsed from results.jsonl, most recent entry first."""
        from timmy.autoresearch import get_experiment_history

        records = [
            {"metric": 1.2, "success": True},
            {"metric": 1.1, "success": True},
        ]
        payload = "".join(json.dumps(rec) + "\n" for rec in records)
        (tmp_path / "results.jsonl").write_text(payload)

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        # Entries come back newest-first.
        assert history[0]["metric"] == 1.1


class TestExtractMetric:
    """Tests for _extract_metric()."""

    def test_extracts_last_value(self):
        """Only the final occurrence of the metric is returned."""
        from timmy.autoresearch import _extract_metric

        log = "val_bpb: 1.5\nval_bpb: 1.3\nval_bpb: 1.1"
        assert _extract_metric(log) == pytest.approx(1.1)

    def test_no_match_returns_none(self):
        """Output with no metric line yields None."""
        from timmy.autoresearch import _extract_metric

        assert _extract_metric("no metrics here") is None

    def test_custom_metric_name(self):
        """A non-default metric name can be extracted."""
        from timmy.autoresearch import _extract_metric

        log = "loss: 0.45\nloss: 0.32"
        assert _extract_metric(log, "loss") == pytest.approx(0.32)


class TestExtractPassRate:
    """Tests for _extract_pass_rate()."""

    def test_all_passing(self):
        """A pytest summary with only passes maps to 100%."""
        from timmy.autoresearch import _extract_pass_rate

        summary = "5 passed in 1.23s"
        assert _extract_pass_rate(summary) == pytest.approx(100.0)

    def test_mixed_results(self):
        """8 of 10 passing maps to 80%."""
        from timmy.autoresearch import _extract_pass_rate

        summary = "8 passed, 2 failed in 2.00s"
        assert _extract_pass_rate(summary) == pytest.approx(80.0)

    def test_no_pytest_output(self):
        """Output without a pytest summary yields None."""
        from timmy.autoresearch import _extract_pass_rate

        assert _extract_pass_rate("no test results here") is None


class TestExtractCoverage:
    """Tests for _extract_coverage()."""

    def test_total_line(self):
        """The percentage is parsed off the coverage TOTAL line."""
        from timmy.autoresearch import _extract_coverage

        report = "TOTAL 1234 100 92%"
        assert _extract_coverage(report) == pytest.approx(92.0)

    def test_no_coverage(self):
        """Output without a TOTAL line yields None."""
        from timmy.autoresearch import _extract_coverage

        assert _extract_coverage("no coverage data") is None


class TestSystemExperiment:
    """Tests for SystemExperiment class."""

    def test_generate_hypothesis_with_program(self):
        """The hypothesis includes both the target path and the direction text."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="src/timmy/agent.py")
        hyp = exp.generate_hypothesis("Fix memory leak in session handling")
        assert "src/timmy/agent.py" in hyp
        assert "Fix memory leak" in hyp

    def test_generate_hypothesis_fallback(self):
        """An empty program still yields a hypothesis naming target and metric."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="src/timmy/agent.py", metric="coverage")
        hyp = exp.generate_hypothesis("")
        assert "src/timmy/agent.py" in hyp
        assert "coverage" in hyp

    def test_generate_hypothesis_skips_comment_lines(self):
        """Leading comment lines in the program are ignored."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="mymodule.py")
        hyp = exp.generate_hypothesis("# comment\nActual direction here")
        assert "Actual direction" in hyp

    def test_evaluate_baseline(self):
        """The first measurement (no baseline yet) is reported as the baseline."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
        result = exp.evaluate(85.0, None)
        assert "Baseline" in result
        assert "85" in result

    def test_evaluate_improvement_higher_is_better(self):
        """For pass-rate metrics, a higher value is an improvement."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
        result = exp.evaluate(90.0, 85.0)
        assert "Improvement" in result

    def test_evaluate_regression_higher_is_better(self):
        """For coverage, a lower value is a regression."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", metric="coverage")
        result = exp.evaluate(80.0, 85.0)
        assert "Regression" in result

    def test_evaluate_none_metric(self):
        """A missing metric value yields an indeterminate verdict."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py")
        result = exp.evaluate(None, 80.0)
        assert "Indeterminate" in result

    def test_evaluate_lower_is_better(self):
        """For val_bpb (lower-is-better), a drop counts as an improvement."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", metric="val_bpb")
        result = exp.evaluate(1.1, 1.2)
        assert "Improvement" in result

    def test_is_improvement_higher_is_better(self):
        """is_improvement() treats unit_pass_rate as higher-is-better."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
        assert exp.is_improvement(90.0, 85.0) is True
        assert exp.is_improvement(80.0, 85.0) is False

    def test_is_improvement_lower_is_better(self):
        """is_improvement() treats val_bpb as lower-is-better."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", metric="val_bpb")
        assert exp.is_improvement(1.1, 1.2) is True
        assert exp.is_improvement(1.3, 1.2) is False

    def test_run_tox_success(self, tmp_path):
        """A passing tox run reports success and a 100% pass-rate metric."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="8 passed in 1.23s",
                stderr="",
            )
            result = exp.run_tox(tox_env="unit")

        assert result["success"] is True
        assert result["metric"] == pytest.approx(100.0)

    def test_run_tox_timeout(self, tmp_path):
        """A tox timeout is reported as a budget-exceeded failure."""
        import subprocess

        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", budget_minutes=1, workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.TimeoutExpired(cmd="tox", timeout=60)
            result = exp.run_tox()

        assert result["success"] is False
        assert "Budget exceeded" in result["error"]

    def test_apply_edit_aider_not_installed(self, tmp_path):
        """A missing aider binary is reported as 'not available', not raised."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = FileNotFoundError("aider not found")
            result = exp.apply_edit("some hypothesis")

        assert "not available" in result

    def test_commit_changes_success(self, tmp_path):
        """A clean git exit means commit_changes() returns True."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0)
            success = exp.commit_changes("test commit")

        assert success is True

    def test_revert_changes_failure(self, tmp_path):
        """A git failure means revert_changes() returns False."""
        import subprocess

        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.CalledProcessError(1, "git")
            success = exp.revert_changes()

        assert success is False

    def test_create_branch_success(self, tmp_path):
        """create_branch() issues `git checkout -b <name>` and returns True."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0)
            success = exp.create_branch("feature/test-branch")

        assert success is True
        # Verify correct git command was called
        mock_run.assert_called_once()
        call_args = mock_run.call_args[0][0]
        assert "checkout" in call_args
        assert "-b" in call_args
        assert "feature/test-branch" in call_args

    def test_create_branch_failure(self, tmp_path):
        """A git failure means create_branch() returns False."""
        import subprocess

        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.CalledProcessError(1, "git")
            success = exp.create_branch("feature/test-branch")

        assert success is False

    def test_run_dry_run_mode(self, tmp_path):
        """Test that run() in dry_run mode only generates hypotheses."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        result = exp.run(max_iterations=3, dry_run=True, program_content="Test program")

        assert result["iterations"] == 3
        assert result["success"] is False  # No actual experiments run
        assert len(exp.results) == 3
        # Each result should have a hypothesis
        for record in exp.results:
            assert "hypothesis" in record

    def test_run_with_custom_metric_fn(self, tmp_path):
        """Test that custom metric_fn is used for metric extraction."""
        # Plain local import instead of the original `__import__("re")` hack,
        # which obscured the dependency and defeated linters.
        import re

        from timmy.autoresearch import SystemExperiment

        def custom_metric_fn(output: str) -> float | None:
            match = re.search(r"custom_metric:\s*([0-9.]+)", output)
            return float(match.group(1)) if match else None

        exp = SystemExperiment(
            target="x.py",
            workspace=tmp_path,
            metric="custom",
            metric_fn=custom_metric_fn,
        )

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="custom_metric: 42.5\nother output",
                stderr="",
            )
            tox_result = exp.run_tox()

        assert tox_result["metric"] == pytest.approx(42.5)

    def test_run_single_iteration_success(self, tmp_path):
        """Test a successful single iteration that finds an improvement."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            # Mock tox returning a passing test with metric
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="10 passed in 1.23s",
                stderr="",
            )
            result = exp.run(max_iterations=1, tox_env="unit")

        assert result["iterations"] == 1
        assert len(exp.results) == 1
        assert exp.results[0]["metric"] == pytest.approx(100.0)

    def test_run_stores_baseline_on_first_success(self, tmp_path):
        """Test that baseline is set after first successful iteration."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        assert exp.baseline is None

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="8 passed in 1.23s",
                stderr="",
            )
            exp.run(max_iterations=1)

        assert exp.baseline == pytest.approx(100.0)
        assert exp.results[0]["baseline"] is None  # First run has no baseline