"""Tests for the autoresearch module — autonomous ML experiment loops.""" import json from unittest.mock import MagicMock, patch import pytest class TestAppleSiliconHelpers: """Tests for is_apple_silicon() and _build_experiment_env().""" def test_is_apple_silicon_true_on_arm64_darwin(self): from timmy.autoresearch import is_apple_silicon with ( patch("timmy.autoresearch.platform.system", return_value="Darwin"), patch("timmy.autoresearch.platform.machine", return_value="arm64"), ): assert is_apple_silicon() is True def test_is_apple_silicon_false_on_linux(self): from timmy.autoresearch import is_apple_silicon with ( patch("timmy.autoresearch.platform.system", return_value="Linux"), patch("timmy.autoresearch.platform.machine", return_value="x86_64"), ): assert is_apple_silicon() is False def test_build_env_auto_resolves_mlx_on_apple_silicon(self): from timmy.autoresearch import _build_experiment_env with patch("timmy.autoresearch.is_apple_silicon", return_value=True): env = _build_experiment_env(dataset="tinystories", backend="auto") assert env["AUTORESEARCH_BACKEND"] == "mlx" assert env["AUTORESEARCH_DATASET"] == "tinystories" def test_build_env_auto_resolves_cuda_on_non_apple(self): from timmy.autoresearch import _build_experiment_env with patch("timmy.autoresearch.is_apple_silicon", return_value=False): env = _build_experiment_env(dataset="openwebtext", backend="auto") assert env["AUTORESEARCH_BACKEND"] == "cuda" assert env["AUTORESEARCH_DATASET"] == "openwebtext" def test_build_env_explicit_backend_not_overridden(self): from timmy.autoresearch import _build_experiment_env env = _build_experiment_env(dataset="tinystories", backend="cpu") assert env["AUTORESEARCH_BACKEND"] == "cpu" class TestPrepareExperiment: """Tests for prepare_experiment().""" def test_clones_repo_when_not_present(self, tmp_path): from timmy.autoresearch import prepare_experiment with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") prepare_experiment(tmp_path, "https://example.com/repo.git") assert mock_run.call_count >= 1 clone_call = mock_run.call_args_list[0] assert "git" in clone_call.args[0] assert "clone" in clone_call.args[0] def test_skips_clone_when_present(self, tmp_path): from timmy.autoresearch import prepare_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") prepare_experiment(tmp_path) # Should not call git clone if mock_run.called: assert "clone" not in str(mock_run.call_args_list[0]) def test_clone_failure_returns_error(self, tmp_path): from timmy.autoresearch import prepare_experiment with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="auth failed") result = prepare_experiment(tmp_path) assert "failed" in result.lower() def test_prepare_passes_env_to_prepare_script(self, tmp_path): from timmy.autoresearch import prepare_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() (repo_dir / "prepare.py").write_text("pass") with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") prepare_experiment(tmp_path, dataset="tinystories", backend="cpu") # The prepare.py call is the second call (first is skipped since repo exists) prepare_call = mock_run.call_args assert prepare_call.kwargs.get("env") is not None or 
prepare_call[1].get("env") is not None call_kwargs = prepare_call.kwargs if prepare_call.kwargs else prepare_call[1] assert call_kwargs["env"]["AUTORESEARCH_DATASET"] == "tinystories" assert call_kwargs["env"]["AUTORESEARCH_BACKEND"] == "cpu" class TestRunExperiment: """Tests for run_experiment().""" def test_successful_run_extracts_metric(self, tmp_path): from timmy.autoresearch import run_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() (repo_dir / "train.py").write_text("print('training')") with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.return_value = MagicMock( returncode=0, stdout="step 1000 val_bpb: 1.2345\nstep 2000 val_bpb: 1.1234", stderr="", ) result = run_experiment(tmp_path, timeout=60) assert result["success"] is True assert result["metric"] == pytest.approx(1.1234) assert result["error"] is None def test_timeout_returns_error(self, tmp_path): import subprocess from timmy.autoresearch import run_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() (repo_dir / "train.py").write_text("print('training')") with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.side_effect = subprocess.TimeoutExpired(cmd="python", timeout=5) result = run_experiment(tmp_path, timeout=5) assert result["success"] is False assert "timed out" in result["error"].lower() def test_missing_train_py(self, tmp_path): from timmy.autoresearch import run_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() # No train.py result = run_experiment(tmp_path) assert result["success"] is False assert "not found" in result["error"].lower() def test_no_metric_in_output(self, tmp_path): from timmy.autoresearch import run_experiment repo_dir = tmp_path / "autoresearch" repo_dir.mkdir() (repo_dir / "train.py").write_text("print('done')") with patch("timmy.autoresearch.subprocess.run") as mock_run: mock_run.return_value = MagicMock(returncode=0, stdout="no metrics here", stderr="") result = run_experiment(tmp_path) assert result["success"] is True assert result["metric"] is None class TestEvaluateResult: """Tests for evaluate_result().""" def test_improvement_detected(self): from timmy.autoresearch import evaluate_result result = evaluate_result(1.10, 1.20) assert "improvement" in result.lower() def test_regression_detected(self): from timmy.autoresearch import evaluate_result result = evaluate_result(1.30, 1.20) assert "regression" in result.lower() def test_no_change(self): from timmy.autoresearch import evaluate_result result = evaluate_result(1.20, 1.20) assert "no change" in result.lower() class TestExperimentHistory: """Tests for get_experiment_history().""" def test_empty_workspace(self, tmp_path): from timmy.autoresearch import get_experiment_history history = get_experiment_history(tmp_path) assert history == [] def test_reads_jsonl(self, tmp_path): from timmy.autoresearch import get_experiment_history results_file = tmp_path / "results.jsonl" results_file.write_text( json.dumps({"metric": 1.2, "success": True}) + "\n" + json.dumps({"metric": 1.1, "success": True}) + "\n" ) history = get_experiment_history(tmp_path) assert len(history) == 2 # Most recent first assert history[0]["metric"] == 1.1 class TestExtractMetric: """Tests for _extract_metric().""" def test_extracts_last_value(self): from timmy.autoresearch import _extract_metric output = "val_bpb: 1.5\nval_bpb: 1.3\nval_bpb: 1.1" assert _extract_metric(output) == pytest.approx(1.1) def test_no_match_returns_none(self): from timmy.autoresearch import _extract_metric assert 
_extract_metric("no metrics here") is None def test_custom_metric_name(self): from timmy.autoresearch import _extract_metric output = "loss: 0.45\nloss: 0.32" assert _extract_metric(output, "loss") == pytest.approx(0.32) class TestExtractPassRate: """Tests for _extract_pass_rate().""" def test_all_passing(self): from timmy.autoresearch import _extract_pass_rate output = "5 passed in 1.23s" assert _extract_pass_rate(output) == pytest.approx(100.0) def test_mixed_results(self): from timmy.autoresearch import _extract_pass_rate output = "8 passed, 2 failed in 2.00s" assert _extract_pass_rate(output) == pytest.approx(80.0) def test_no_pytest_output(self): from timmy.autoresearch import _extract_pass_rate assert _extract_pass_rate("no test results here") is None class TestExtractCoverage: """Tests for _extract_coverage().""" def test_total_line(self): from timmy.autoresearch import _extract_coverage output = "TOTAL 1234 100 92%" assert _extract_coverage(output) == pytest.approx(92.0) def test_no_coverage(self): from timmy.autoresearch import _extract_coverage assert _extract_coverage("no coverage data") is None class TestSystemExperiment: """Tests for SystemExperiment class.""" def test_generate_hypothesis_with_program(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="src/timmy/agent.py") hyp = exp.generate_hypothesis("Fix memory leak in session handling") assert "src/timmy/agent.py" in hyp assert "Fix memory leak" in hyp def test_generate_hypothesis_fallback(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="src/timmy/agent.py", metric="coverage") hyp = exp.generate_hypothesis("") assert "src/timmy/agent.py" in hyp assert "coverage" in hyp def test_generate_hypothesis_skips_comment_lines(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="mymodule.py") hyp = exp.generate_hypothesis("# comment\nActual direction here") assert "Actual direction" in hyp def test_evaluate_baseline(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="x.py", metric="unit_pass_rate") result = exp.evaluate(85.0, None) assert "Baseline" in result assert "85" in result def test_evaluate_improvement_higher_is_better(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="x.py", metric="unit_pass_rate") result = exp.evaluate(90.0, 85.0) assert "Improvement" in result def test_evaluate_regression_higher_is_better(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="x.py", metric="coverage") result = exp.evaluate(80.0, 85.0) assert "Regression" in result def test_evaluate_none_metric(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="x.py") result = exp.evaluate(None, 80.0) assert "Indeterminate" in result def test_evaluate_lower_is_better(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="x.py", metric="val_bpb") result = exp.evaluate(1.1, 1.2) assert "Improvement" in result def test_is_improvement_higher_is_better(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="x.py", metric="unit_pass_rate") assert exp.is_improvement(90.0, 85.0) is True assert exp.is_improvement(80.0, 85.0) is False def test_is_improvement_lower_is_better(self): from timmy.autoresearch import SystemExperiment exp = SystemExperiment(target="x.py", metric="val_bpb") assert exp.is_improvement(1.1, 1.2) is True assert exp.is_improvement(1.3, 1.2) is False 
    def test_run_tox_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="8 passed in 1.23s",
                stderr="",
            )
            result = exp.run_tox(tox_env="unit")
        assert result["success"] is True
        assert result["metric"] == pytest.approx(100.0)

    def test_run_tox_timeout(self, tmp_path):
        import subprocess

        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", budget_minutes=1, workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.TimeoutExpired(cmd="tox", timeout=60)
            result = exp.run_tox()
        assert result["success"] is False
        assert "Budget exceeded" in result["error"]

    def test_apply_edit_aider_not_installed(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = FileNotFoundError("aider not found")
            result = exp.apply_edit("some hypothesis")
        assert "not available" in result

    def test_commit_changes_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0)
            success = exp.commit_changes("test commit")
        assert success is True

    def test_revert_changes_failure(self, tmp_path):
        import subprocess

        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.CalledProcessError(1, "git")
            success = exp.revert_changes()
        assert success is False

    def test_create_branch_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0)
            success = exp.create_branch("feature/test-branch")
        assert success is True
        # Verify correct git command was called
        mock_run.assert_called_once()
        call_args = mock_run.call_args[0][0]
        assert "checkout" in call_args
        assert "-b" in call_args
        assert "feature/test-branch" in call_args

    def test_create_branch_failure(self, tmp_path):
        import subprocess

        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.CalledProcessError(1, "git")
            success = exp.create_branch("feature/test-branch")
        assert success is False

    def test_run_dry_run_mode(self, tmp_path):
        """Test that run() in dry_run mode only generates hypotheses."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        result = exp.run(max_iterations=3, dry_run=True, program_content="Test program")
        assert result["iterations"] == 3
        assert result["success"] is False  # No actual experiments run
        assert len(exp.results) == 3
        # Each result should have a hypothesis
        for record in exp.results:
            assert "hypothesis" in record

    def test_run_with_custom_metric_fn(self, tmp_path):
        """Test that custom metric_fn is used for metric extraction."""
        import re

        from timmy.autoresearch import SystemExperiment

        def custom_metric_fn(output: str) -> float | None:
            match = re.search(r"custom_metric:\s*([0-9.]+)", output)
            return float(match.group(1)) if match else None

        exp = SystemExperiment(
            target="x.py",
            workspace=tmp_path,
            metric="custom",
            metric_fn=custom_metric_fn,
        )
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="custom_metric: 42.5\nother output",
                stderr="",
            )
            tox_result = exp.run_tox()
        assert tox_result["metric"] == pytest.approx(42.5)

    def test_run_single_iteration_success(self, tmp_path):
        """Test a successful single iteration that finds an improvement."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            # Mock tox returning a passing test with metric
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="10 passed in 1.23s",
                stderr="",
            )
            result = exp.run(max_iterations=1, tox_env="unit")
        assert result["iterations"] == 1
        assert len(exp.results) == 1
        assert exp.results[0]["metric"] == pytest.approx(100.0)

    def test_run_stores_baseline_on_first_success(self, tmp_path):
        """Test that baseline is set after first successful iteration."""
        from timmy.autoresearch import SystemExperiment

        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        assert exp.baseline is None
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="8 passed in 1.23s",
                stderr="",
            )
            exp.run(max_iterations=1)
        assert exp.baseline == pytest.approx(100.0)
        assert exp.results[0]["baseline"] is None  # First run has no baseline