Timmy-time-dashboard/tests/timmy/test_autoresearch.py
Alexander Whitestone f0657000ab
feat: add timmy learn CLI command and SystemExperiment class
Implements the autoresearch entry point requested in #907:

- `SystemExperiment` class in autoresearch.py encapsulates the
  hypothesis → edit → tox → evaluate → commit/revert loop with
  configurable target, metric, and budget
- `timmy learn` CLI command with --target, --metric, --budget,
  --max-experiments, --dry-run, --tox-env, and --model flags
- Helper functions _extract_pass_rate() and _extract_coverage()
  for built-in metric extraction from tox/pytest output
- program.md template in repo root for research direction
- Unit tests for SystemExperiment and the learn CLI command

Fixes #907

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-23 19:06:27 -04:00
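
Below is a minimal sketch of how the pieces are meant to compose, inferred
from the API exercised by the unit tests in this file; the exact wiring
inside `timmy learn` may differ, and the paths here are placeholders.

from pathlib import Path
from timmy.autoresearch import SystemExperiment

exp = SystemExperiment(target="src/timmy/agent.py", metric="unit_pass_rate",
                       budget_minutes=30, workspace=Path("workspace"))
baseline = exp.run_tox(tox_env="unit")["metric"]    # measure before editing
hypothesis = exp.generate_hypothesis(Path("program.md").read_text())
exp.apply_edit(hypothesis)                          # apply the edit via aider
result = exp.run_tox(tox_env="unit")                # re-run the suite
print(exp.evaluate(result["metric"], baseline))
if exp.is_improvement(result["metric"], baseline):
    exp.commit_changes(hypothesis)                  # keep the winning change
else:
    exp.revert_changes()                            # roll back the attempt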


"""Tests for the autoresearch module — autonomous ML experiment loops."""
import json
from unittest.mock import MagicMock, patch
import pytest
class TestPrepareExperiment:
    """Tests for prepare_experiment()."""

    def test_clones_repo_when_not_present(self, tmp_path):
        from timmy.autoresearch import prepare_experiment
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
            prepare_experiment(tmp_path, "https://example.com/repo.git")
            assert mock_run.call_count >= 1
            clone_call = mock_run.call_args_list[0]
            assert "git" in clone_call.args[0]
            assert "clone" in clone_call.args[0]

    def test_skips_clone_when_present(self, tmp_path):
        from timmy.autoresearch import prepare_experiment
        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
            prepare_experiment(tmp_path)
            # Should not call git clone
            if mock_run.called:
                assert "clone" not in str(mock_run.call_args_list[0])

    def test_clone_failure_returns_error(self, tmp_path):
        from timmy.autoresearch import prepare_experiment
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="auth failed")
            result = prepare_experiment(tmp_path)
            assert "failed" in result.lower()

class TestRunExperiment:
    """Tests for run_experiment()."""

    def test_successful_run_extracts_metric(self, tmp_path):
        from timmy.autoresearch import run_experiment
        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('training')")
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="step 1000 val_bpb: 1.2345\nstep 2000 val_bpb: 1.1234",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60)
            assert result["success"] is True
            assert result["metric"] == pytest.approx(1.1234)
            assert result["error"] is None

    def test_timeout_returns_error(self, tmp_path):
        import subprocess
        from timmy.autoresearch import run_experiment
        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('training')")
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.TimeoutExpired(cmd="python", timeout=5)
            result = run_experiment(tmp_path, timeout=5)
            assert result["success"] is False
            assert "timed out" in result["error"].lower()

    def test_missing_train_py(self, tmp_path):
        from timmy.autoresearch import run_experiment
        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        # No train.py
        result = run_experiment(tmp_path)
        assert result["success"] is False
        assert "not found" in result["error"].lower()

    def test_no_metric_in_output(self, tmp_path):
        from timmy.autoresearch import run_experiment
        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('done')")
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0, stdout="no metrics here", stderr="")
            result = run_experiment(tmp_path)
            assert result["success"] is True
            assert result["metric"] is None

class TestEvaluateResult:
    """Tests for evaluate_result()."""

    def test_improvement_detected(self):
        from timmy.autoresearch import evaluate_result
        result = evaluate_result(1.10, 1.20)
        assert "improvement" in result.lower()

    def test_regression_detected(self):
        from timmy.autoresearch import evaluate_result
        result = evaluate_result(1.30, 1.20)
        assert "regression" in result.lower()

    def test_no_change(self):
        from timmy.autoresearch import evaluate_result
        result = evaluate_result(1.20, 1.20)
        assert "no change" in result.lower()

class TestExperimentHistory:
    """Tests for get_experiment_history()."""

    def test_empty_workspace(self, tmp_path):
        from timmy.autoresearch import get_experiment_history
        history = get_experiment_history(tmp_path)
        assert history == []

    def test_reads_jsonl(self, tmp_path):
        from timmy.autoresearch import get_experiment_history
        results_file = tmp_path / "results.jsonl"
        results_file.write_text(
            json.dumps({"metric": 1.2, "success": True})
            + "\n"
            + json.dumps({"metric": 1.1, "success": True})
            + "\n"
        )
        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        # Most recent first
        assert history[0]["metric"] == 1.1

class TestExtractMetric:
    """Tests for _extract_metric()."""

    def test_extracts_last_value(self):
        from timmy.autoresearch import _extract_metric
        output = "val_bpb: 1.5\nval_bpb: 1.3\nval_bpb: 1.1"
        assert _extract_metric(output) == pytest.approx(1.1)

    def test_no_match_returns_none(self):
        from timmy.autoresearch import _extract_metric
        assert _extract_metric("no metrics here") is None

    def test_custom_metric_name(self):
        from timmy.autoresearch import _extract_metric
        output = "loss: 0.45\nloss: 0.32"
        assert _extract_metric(output, "loss") == pytest.approx(0.32)

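# A last-match regex would satisfy the assertions above. Hypothetical sketch
# (the real _extract_metric may differ):
#
#     matches = re.findall(rf"{name}:\s*(\d+(?:\.\d+)?)", output)
#     return float(matches[-1]) if matches else None
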
class TestExtractPassRate:
    """Tests for _extract_pass_rate()."""

    def test_all_passing(self):
        from timmy.autoresearch import _extract_pass_rate
        output = "5 passed in 1.23s"
        assert _extract_pass_rate(output) == pytest.approx(100.0)

    def test_mixed_results(self):
        from timmy.autoresearch import _extract_pass_rate
        output = "8 passed, 2 failed in 2.00s"
        assert _extract_pass_rate(output) == pytest.approx(80.0)

    def test_no_pytest_output(self):
        from timmy.autoresearch import _extract_pass_rate
        assert _extract_pass_rate("no test results here") is None

class TestExtractCoverage:
    """Tests for _extract_coverage()."""

    def test_total_line(self):
        from timmy.autoresearch import _extract_coverage
        output = "TOTAL 1234 100 92%"
        assert _extract_coverage(output) == pytest.approx(92.0)

    def test_no_coverage(self):
        from timmy.autoresearch import _extract_coverage
        assert _extract_coverage("no coverage data") is None

class TestSystemExperiment:
    """Tests for SystemExperiment class."""

    def test_generate_hypothesis_with_program(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="src/timmy/agent.py")
        hyp = exp.generate_hypothesis("Fix memory leak in session handling")
        assert "src/timmy/agent.py" in hyp
        assert "Fix memory leak" in hyp

    def test_generate_hypothesis_fallback(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="src/timmy/agent.py", metric="coverage")
        hyp = exp.generate_hypothesis("")
        assert "src/timmy/agent.py" in hyp
        assert "coverage" in hyp

    def test_generate_hypothesis_skips_comment_lines(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="mymodule.py")
        hyp = exp.generate_hypothesis("# comment\nActual direction here")
        assert "Actual direction" in hyp

    def test_evaluate_baseline(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
        result = exp.evaluate(85.0, None)
        assert "Baseline" in result
        assert "85" in result

    def test_evaluate_improvement_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
        result = exp.evaluate(90.0, 85.0)
        assert "Improvement" in result

    def test_evaluate_regression_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", metric="coverage")
        result = exp.evaluate(80.0, 85.0)
        assert "Regression" in result

    def test_evaluate_none_metric(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py")
        result = exp.evaluate(None, 80.0)
        assert "Indeterminate" in result

    def test_evaluate_lower_is_better(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", metric="val_bpb")
        result = exp.evaluate(1.1, 1.2)
        assert "Improvement" in result

    def test_is_improvement_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
        assert exp.is_improvement(90.0, 85.0) is True
        assert exp.is_improvement(80.0, 85.0) is False

    def test_is_improvement_lower_is_better(self):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", metric="val_bpb")
        assert exp.is_improvement(1.1, 1.2) is True
        assert exp.is_improvement(1.3, 1.2) is False

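    # The assertions above imply the improvement direction is keyed off the
    # metric name: unit_pass_rate and coverage are higher-is-better, while
    # val_bpb-style losses are lower-is-better (inferred, not confirmed here).
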
    def test_run_tox_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="8 passed in 1.23s",
                stderr="",
            )
            result = exp.run_tox(tox_env="unit")
            assert result["success"] is True
            assert result["metric"] == pytest.approx(100.0)

    def test_run_tox_timeout(self, tmp_path):
        import subprocess
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", budget_minutes=1, workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.TimeoutExpired(cmd="tox", timeout=60)
            result = exp.run_tox()
            assert result["success"] is False
            assert "Budget exceeded" in result["error"]

    def test_apply_edit_aider_not_installed(self, tmp_path):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = FileNotFoundError("aider not found")
            result = exp.apply_edit("some hypothesis")
            assert "not available" in result

    def test_commit_changes_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0)
            success = exp.commit_changes("test commit")
            assert success is True

    def test_revert_changes_failure(self, tmp_path):
        import subprocess
        from timmy.autoresearch import SystemExperiment
        exp = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.CalledProcessError(1, "git")
            success = exp.revert_changes()
            assert success is False