[kimi] Refactor autoresearch.py -> SystemExperiment class (#906) (#1244)

Co-authored-by: Kimi Agent <kimi@timmy.local>
Co-committed-by: Kimi Agent <kimi@timmy.local>
This commit is contained in:
2026-03-23 23:28:54 +00:00
committed by Alexander Whitestone
parent 6691f4d1f3
commit 261b7be468
3 changed files with 380 additions and 65 deletions

View File

@@ -39,7 +39,9 @@ class TestPrepareExperiment:
from timmy.autoresearch import prepare_experiment
with patch("timmy.autoresearch.subprocess.run") as mock_run:
mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="auth failed")
mock_run.return_value = MagicMock(
returncode=1, stdout="", stderr="auth failed"
)
result = prepare_experiment(tmp_path)
assert "failed" in result.lower()
@@ -102,7 +104,9 @@ class TestRunExperiment:
(repo_dir / "train.py").write_text("print('done')")
with patch("timmy.autoresearch.subprocess.run") as mock_run:
mock_run.return_value = MagicMock(returncode=0, stdout="no metrics here", stderr="")
mock_run.return_value = MagicMock(
returncode=0, stdout="no metrics here", stderr=""
)
result = run_experiment(tmp_path)
assert result["success"] is True
@@ -349,3 +353,107 @@ class TestSystemExperiment:
success = exp.revert_changes()
assert success is False
def test_create_branch_success(self, tmp_path):
    """create_branch() returns True and issues the expected checkout command.

    ``subprocess.run`` is patched to return a zero exit code; the test then
    inspects the single recorded call for the ``checkout``/``-b`` tokens and
    the requested branch name.
    """
    from timmy.autoresearch import SystemExperiment

    branch = "feature/test-branch"
    experiment = SystemExperiment(target="x.py", workspace=tmp_path)
    with patch(
        "timmy.autoresearch.subprocess.run",
        return_value=MagicMock(returncode=0),
    ) as fake_run:
        created = experiment.create_branch(branch)
    assert created is True
    # Exactly one subprocess call; its first positional argument is the command.
    fake_run.assert_called_once()
    command = fake_run.call_args.args[0]
    for token in ("checkout", "-b", branch):
        assert token in command
def test_create_branch_failure(self, tmp_path):
    """create_branch() returns False when subprocess.run raises CalledProcessError."""
    import subprocess

    from timmy.autoresearch import SystemExperiment

    git_error = subprocess.CalledProcessError(1, "git")
    experiment = SystemExperiment(target="x.py", workspace=tmp_path)
    # Every patched subprocess.run call raises the prepared error.
    with patch("timmy.autoresearch.subprocess.run", side_effect=git_error):
        outcome = experiment.create_branch("feature/test-branch")
    assert outcome is False
def test_run_dry_run_mode(self, tmp_path):
    """A dry run records one hypothesis per iteration and reports no success."""
    from timmy.autoresearch import SystemExperiment

    iterations = 3
    experiment = SystemExperiment(target="x.py", workspace=tmp_path)
    summary = experiment.run(
        max_iterations=iterations,
        dry_run=True,
        program_content="Test program",
    )
    assert summary["iterations"] == iterations
    # No actual experiments run in dry-run mode, so success stays False.
    assert summary["success"] is False
    assert len(experiment.results) == iterations
    # Every recorded iteration must carry a hypothesis.
    assert all("hypothesis" in entry for entry in experiment.results)
def test_run_with_custom_metric_fn(self, tmp_path):
    """Test that custom metric_fn is used for metric extraction.

    A ``SystemExperiment`` is built with a user-supplied ``metric_fn``;
    ``subprocess.run`` is patched so that stdout contains
    ``custom_metric: 42.5``, and ``run_tox()`` must report that value
    under the ``"metric"`` key.
    """
    # Function-scope import instead of an inline ``__import__("re")`` call:
    # same module, but readable and visible to linters.
    import re

    from timmy.autoresearch import SystemExperiment

    def custom_metric_fn(output: str) -> float | None:
        """Return the number following ``custom_metric:``, or None if absent."""
        match = re.search(r"custom_metric:\s*([0-9.]+)", output)
        return float(match.group(1)) if match else None

    exp = SystemExperiment(
        target="x.py",
        workspace=tmp_path,
        metric="custom",
        metric_fn=custom_metric_fn,
    )
    with patch("timmy.autoresearch.subprocess.run") as mock_run:
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout="custom_metric: 42.5\nother output",
            stderr="",
        )
        tox_result = exp.run_tox()
    assert tox_result["metric"] == pytest.approx(42.5)
def test_run_single_iteration_success(self, tmp_path):
    """One iteration against a mocked passing run yields a single result."""
    from timmy.autoresearch import SystemExperiment

    experiment = SystemExperiment(target="x.py", workspace=tmp_path)
    # Stand-in for the tox subprocess: zero exit code, output with only passes.
    passing_run = MagicMock(returncode=0, stdout="10 passed in 1.23s", stderr="")
    with patch("timmy.autoresearch.subprocess.run", return_value=passing_run):
        summary = experiment.run(max_iterations=1, tox_env="unit")
    assert summary["iterations"] == 1
    recorded = experiment.results
    assert len(recorded) == 1
    assert recorded[0]["metric"] == pytest.approx(100.0)
def test_run_stores_baseline_on_first_success(self, tmp_path):
    """The first successful run() populates ``baseline`` on the experiment."""
    from timmy.autoresearch import SystemExperiment

    experiment = SystemExperiment(target="x.py", workspace=tmp_path)
    # A fresh experiment starts without a baseline.
    assert experiment.baseline is None
    tox_output = MagicMock(returncode=0, stdout="8 passed in 1.23s", stderr="")
    with patch("timmy.autoresearch.subprocess.run", return_value=tox_output):
        experiment.run(max_iterations=1)
    assert experiment.baseline == pytest.approx(100.0)
    # First run has no baseline, so its record stores None for it.
    assert experiment.results[0]["baseline"] is None