From 6691f4d1f31b06b046b895ad5d12b3bfb65bd805 Mon Sep 17 00:00:00 2001 From: "Claude (Opus 4.6)" Date: Mon, 23 Mar 2026 23:14:09 +0000 Subject: [PATCH] [claude] Add timmy learn autoresearch entry point (#907) (#1240) Co-authored-by: Claude (Opus 4.6) Co-committed-by: Claude (Opus 4.6) --- program.md | 23 ++++ src/timmy/autoresearch.py | 209 +++++++++++++++++++++++++++++++ src/timmy/cli.py | 154 +++++++++++++++++++++++ tests/timmy/test_autoresearch.py | 173 +++++++++++++++++++++++++ tests/timmy/test_cli_learn.py | 94 ++++++++++++++ 5 files changed, 653 insertions(+) create mode 100644 program.md create mode 100644 tests/timmy/test_cli_learn.py diff --git a/program.md b/program.md new file mode 100644 index 00000000..9edab275 --- /dev/null +++ b/program.md @@ -0,0 +1,23 @@ +# Research Direction + +This file guides the `timmy learn` autoresearch loop. Edit it to focus +autonomous experiments on a specific goal. + +## Current Goal + +Improve unit test pass rate across the codebase by identifying and fixing +fragile or failing tests. + +## Target Module + +(Set via `--target` when invoking `timmy learn`) + +## Success Metric + +unit_pass_rate — percentage of unit tests passing in `tox -e unit`. + +## Notes + +- Experiments run one at a time; each is time-boxed by `--budget`. +- Improvements are committed automatically; regressions are reverted. +- Use `--dry-run` to preview hypotheses without making changes. 
diff --git a/src/timmy/autoresearch.py b/src/timmy/autoresearch.py index 14fbea74..ede95927 100644 --- a/src/timmy/autoresearch.py +++ b/src/timmy/autoresearch.py @@ -212,3 +212,212 @@ def _append_result(workspace: Path, result: dict[str, Any]) -> None: results_file.parent.mkdir(parents=True, exist_ok=True) with results_file.open("a") as f: f.write(json.dumps(result) + "\n") + + +def _extract_pass_rate(output: str) -> float | None: + """Extract pytest pass rate as a percentage from tox/pytest output.""" + passed_m = re.search(r"(\d+) passed", output) + failed_m = re.search(r"(\d+) failed", output) + if passed_m: + passed = int(passed_m.group(1)) + failed = int(failed_m.group(1)) if failed_m else 0 + total = passed + failed + return (passed / total * 100.0) if total > 0 else 100.0 + return None + + +def _extract_coverage(output: str) -> float | None: + """Extract total coverage percentage from coverage output.""" + coverage_m = re.search(r"(?:TOTAL\s+\d+\s+\d+\s+|Total coverage:\s*)(\d+)%", output) + if coverage_m: + try: + return float(coverage_m.group(1)) + except ValueError: + pass + return None + + +# ── Higher-is-better metric names ──────────────────────────────────────────── +_HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"}) + + +class SystemExperiment: + """An autoresearch experiment targeting a specific module with a configurable metric. + + Encapsulates the hypothesis → edit → tox → evaluate → commit/revert loop + for a single target file or module. + + Args: + target: Path or module name to optimise (e.g. ``src/timmy/agent.py``). + metric: Metric to extract from tox output. Built-in values: + ``unit_pass_rate`` (default), ``coverage``, ``val_bpb``. + Any other value is forwarded to :func:`_extract_metric`. + budget_minutes: Wall-clock budget per experiment (default 5 min). + workspace: Working directory for subprocess calls. Defaults to ``cwd``. 
+ """ + + def __init__( + self, + target: str, + metric: str = "unit_pass_rate", + budget_minutes: int = 5, + workspace: Path | None = None, + ) -> None: + self.target = target + self.metric = metric + self.budget_seconds = budget_minutes * 60 + self.workspace = Path(workspace) if workspace else Path.cwd() + + # ── Hypothesis generation ───────────────────────────────────────────────── + + def generate_hypothesis(self, program_content: str = "") -> str: + """Return a plain-English hypothesis for the next experiment. + + Uses the first non-empty line of *program_content* when available; + falls back to a generic description based on target and metric. + """ + first_line = "" + for line in program_content.splitlines(): + stripped = line.strip() + if stripped and not stripped.startswith("#"): + first_line = stripped[:120] + break + if first_line: + return f"[{self.target}] {first_line}" + return f"Improve {self.metric} for {self.target}" + + # ── Edit phase ──────────────────────────────────────────────────────────── + + def apply_edit(self, hypothesis: str, model: str = "qwen3:30b") -> str: + """Apply code edits to *target* via Aider. + + Returns a status string. Degrades gracefully — never raises. + """ + prompt = f"Edit {self.target}: {hypothesis}" + try: + result = subprocess.run( + ["aider", "--no-git", "--model", f"ollama/{model}", "--quiet", prompt], + capture_output=True, + text=True, + timeout=self.budget_seconds, + cwd=str(self.workspace), + ) + if result.returncode == 0: + return result.stdout or "Edit applied." 
+ return f"Aider error (exit {result.returncode}): {result.stderr[:500]}" + except FileNotFoundError: + logger.warning("Aider not installed — edit skipped") + return "Aider not available — edit skipped" + except subprocess.TimeoutExpired: + logger.warning("Aider timed out after %ds", self.budget_seconds) + return "Aider timed out" + except (OSError, subprocess.SubprocessError) as exc: + logger.warning("Aider failed: %s", exc) + return f"Edit failed: {exc}" + + # ── Evaluation phase ────────────────────────────────────────────────────── + + def run_tox(self, tox_env: str = "unit") -> dict[str, Any]: + """Run *tox_env* and return a result dict. + + Returns: + Dict with keys: ``metric`` (float|None), ``log`` (str), + ``duration_s`` (int), ``success`` (bool), ``error`` (str|None). + """ + start = time.monotonic() + try: + result = subprocess.run( + ["tox", "-e", tox_env], + capture_output=True, + text=True, + timeout=self.budget_seconds, + cwd=str(self.workspace), + ) + duration = int(time.monotonic() - start) + output = result.stdout + result.stderr + metric_val = self._extract_tox_metric(output) + return { + "metric": metric_val, + "log": output[-3000:], + "duration_s": duration, + "success": result.returncode == 0, + "error": None if result.returncode == 0 else f"Exit code {result.returncode}", + } + except subprocess.TimeoutExpired: + duration = int(time.monotonic() - start) + return { + "metric": None, + "log": f"Budget exceeded after {self.budget_seconds}s", + "duration_s": duration, + "success": False, + "error": f"Budget exceeded after {self.budget_seconds}s", + } + except OSError as exc: + return {"metric": None, "log": "", "duration_s": 0, "success": False, "error": str(exc)} + + def _extract_tox_metric(self, output: str) -> float | None: + """Dispatch to the correct metric extractor based on *self.metric*.""" + if self.metric == "unit_pass_rate": + return _extract_pass_rate(output) + if self.metric == "coverage": + return _extract_coverage(output) + return 
_extract_metric(output, self.metric) + + def evaluate(self, current: float | None, baseline: float | None) -> str: + """Compare *current* metric against *baseline* and return an assessment.""" + if current is None: + return "Indeterminate: metric not extracted from output" + if baseline is None: + unit = "%" if self.metric in _HIGHER_IS_BETTER else "" + return f"Baseline: {self.metric} = {current:.2f}{unit}" + + if self.metric in _HIGHER_IS_BETTER: + delta = current - baseline + pct = (delta / baseline * 100) if baseline != 0 else 0.0 + if delta > 0: + return f"Improvement: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)" + if delta < 0: + return f"Regression: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)" + return f"No change: {self.metric} = {current:.2f}%" + + # lower-is-better (val_bpb, loss, etc.) + return evaluate_result(current, baseline, self.metric) + + def is_improvement(self, current: float, baseline: float) -> bool: + """Return True if *current* is better than *baseline* for this metric.""" + if self.metric in _HIGHER_IS_BETTER: + return current > baseline + return current < baseline # lower-is-better + + # ── Git phase ───────────────────────────────────────────────────────────── + + def commit_changes(self, message: str) -> bool: + """Stage and commit all changes. Returns True on success.""" + try: + subprocess.run( + ["git", "add", "-A"], cwd=str(self.workspace), check=True, timeout=30 + ) + subprocess.run( + ["git", "commit", "-m", message], + cwd=str(self.workspace), + check=True, + timeout=30, + ) + return True + except subprocess.CalledProcessError as exc: + logger.warning("Git commit failed: %s", exc) + return False + + def revert_changes(self) -> bool: + """Revert all uncommitted changes. 
@app.command()
def learn(
    target: str | None = typer.Option(
        None,
        "--target",
        "-t",
        help="Module or file to optimise (e.g. 'src/timmy/agent.py')",
    ),
    metric: str = typer.Option(
        "unit_pass_rate",
        "--metric",
        "-m",
        help="Metric to track: unit_pass_rate | coverage | val_bpb | <custom>",
    ),
    budget: int = typer.Option(
        5,
        "--budget",
        help="Time limit per experiment in minutes",
    ),
    max_experiments: int = typer.Option(
        10,
        "--max-experiments",
        help="Cap on total experiments per run",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Show hypothesis without executing experiments",
    ),
    program_file: str | None = typer.Option(
        None,
        "--program",
        "-p",
        help="Path to research direction file (default: program.md in cwd)",
    ),
    tox_env: str = typer.Option(
        "unit",
        "--tox-env",
        help="Tox environment to run for each evaluation",
    ),
    model: str = typer.Option(
        "qwen3:30b",
        "--model",
        help="Ollama model forwarded to Aider for code edits",
    ),
):
    """Start an autonomous improvement loop (autoresearch).

    Reads program.md for research direction, measures a baseline with tox,
    then iterates: hypothesis → edit → tox → evaluate → commit/revert.

    An experiment is committed only when the metric strictly improves on the
    baseline. If the baseline tox run was green, the experiment must be green
    too; if the baseline was already failing, an improved metric is accepted
    even though tox still exits non-zero. Everything else is reverted.

    Experiments continue until --max-experiments is reached or the loop is
    interrupted with Ctrl+C. Use --dry-run to preview hypotheses without
    making any changes.

    Example:
        timmy learn --target src/timmy/agent.py --metric unit_pass_rate
    """
    from pathlib import Path

    from timmy.autoresearch import SystemExperiment

    # Validate the only mandatory option before producing any other output.
    if target is None:
        typer.echo(
            "Error: --target is required. Specify the module or file to optimise.", err=True
        )
        raise typer.Exit(1)

    repo_root = Path.cwd()
    program_path = Path(program_file) if program_file else repo_root / "program.md"

    if program_path.exists():
        program_content = program_path.read_text(encoding="utf-8")
        typer.echo(f"Research direction: {program_path}")
    else:
        program_content = ""
        typer.echo(
            f"Note: {program_path} not found — proceeding without research direction.",
            err=True,
        )

    experiment = SystemExperiment(
        target=target,
        metric=metric,
        budget_minutes=budget,
        workspace=repo_root,
    )

    typer.echo()
    typer.echo(typer.style("Autoresearch", bold=True) + f" — {target}")
    typer.echo(f" metric={metric} budget={budget}min max={max_experiments} tox={tox_env}")
    if dry_run:
        typer.echo(" (dry-run — no changes will be made)")
    typer.echo()

    # Status prefixes returned by SystemExperiment.apply_edit when no usable
    # edit was produced.
    edit_failure_prefixes = (
        "Aider error",
        "Aider not available",
        "Aider timed out",
        "Edit failed",
    )

    baseline: float | None = None
    baseline_green = False  # did the run that produced `baseline` exit 0?

    try:
        if not dry_run:
            # The baseline must describe the untouched tree, so it is taken
            # before the first edit rather than after it.
            typer.echo(" → measuring baseline …", nl=False)
            baseline_run = experiment.run_tox(tox_env=tox_env)
            typer.echo(f" {baseline_run['duration_s']}s")
            baseline = baseline_run["metric"]
            baseline_green = baseline_run["success"]
            typer.echo(f" → {experiment.evaluate(baseline, None)}")
            if baseline is None:
                detail = baseline_run["error"] or "metric not found in tox output"
                typer.echo(f"Error: could not measure baseline {metric} ({detail}).", err=True)
                raise typer.Exit(1)
            typer.echo()

        for i in range(1, max_experiments + 1):
            typer.echo(typer.style(f"[{i}/{max_experiments}]", bold=True), nl=False)

            hypothesis = experiment.generate_hypothesis(program_content)
            typer.echo(f" {hypothesis}")

            if dry_run:
                continue

            # Edit phase
            typer.echo(" → editing …", nl=False)
            edit_result = experiment.apply_edit(hypothesis, model=model)
            if edit_result.startswith(edit_failure_prefixes):
                typer.echo(f" skipped ({edit_result.split(':')[0]})")
                # A failed or timed-out edit may have left partial changes.
                experiment.revert_changes()
                typer.echo()
                continue
            typer.echo(" done")

            # Evaluate phase
            typer.echo(" → running tox …", nl=False)
            tox_result = experiment.run_tox(tox_env=tox_env)
            typer.echo(f" {tox_result['duration_s']}s")

            current = tox_result["metric"]
            assessment = experiment.evaluate(current, baseline)
            typer.echo(f" → {assessment}")

            # Commit / revert phase
            if current is None:
                # Nothing to compare: never keep an unevaluated edit around.
                experiment.revert_changes()
                reason = tox_result["error"] or "metric not extracted"
                typer.echo(f" → reverted ({reason})")
            elif baseline_green and not tox_result["success"]:
                # The edit turned a passing tox run into a failing one.
                experiment.revert_changes()
                typer.echo(f" → reverted ({tox_result['error']})")
            elif experiment.is_improvement(current, baseline):
                commit_msg = f"autoresearch: improve {metric} on {target} — {assessment}"
                if experiment.commit_changes(commit_msg):
                    typer.echo(" → committed")
                    baseline = current
                    baseline_green = tox_result["success"]
                else:
                    experiment.revert_changes()
                    typer.echo(" → commit failed, reverted")
            else:
                experiment.revert_changes()
                typer.echo(" → reverted (no improvement)")

            typer.echo()

    except KeyboardInterrupt:
        typer.echo("\nInterrupted.")
        raise typer.Exit(0) from None

    typer.echo(typer.style("Autoresearch complete.", bold=True))
    if baseline is not None:
        typer.echo(f"Final {metric}: {baseline:.4f}")


def main():
    """Run the CLI application object ``app``."""
    app()
class TestExtractPassRate:
    """Behaviour of _extract_pass_rate() on pytest summary text."""

    def test_all_passing(self):
        from timmy.autoresearch import _extract_pass_rate

        rate = _extract_pass_rate("5 passed in 1.23s")
        assert rate == pytest.approx(100.0)

    def test_mixed_results(self):
        from timmy.autoresearch import _extract_pass_rate

        rate = _extract_pass_rate("8 passed, 2 failed in 2.00s")
        assert rate == pytest.approx(80.0)

    def test_no_pytest_output(self):
        from timmy.autoresearch import _extract_pass_rate

        rate = _extract_pass_rate("no test results here")
        assert rate is None


class TestExtractCoverage:
    """Behaviour of _extract_coverage() on coverage report text."""

    def test_total_line(self):
        from timmy.autoresearch import _extract_coverage

        percent = _extract_coverage("TOTAL 1234 100 92%")
        assert percent == pytest.approx(92.0)

    def test_no_coverage(self):
        from timmy.autoresearch import _extract_coverage

        percent = _extract_coverage("no coverage data")
        assert percent is None


class TestSystemExperiment:
    """Behaviour of the SystemExperiment class, one phase at a time."""

    # ── hypothesis generation ────────────────────────────────────────────────

    def test_generate_hypothesis_with_program(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="src/timmy/agent.py")
        hypothesis = experiment.generate_hypothesis("Fix memory leak in session handling")
        for fragment in ("src/timmy/agent.py", "Fix memory leak"):
            assert fragment in hypothesis

    def test_generate_hypothesis_fallback(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="src/timmy/agent.py", metric="coverage")
        hypothesis = experiment.generate_hypothesis("")
        for fragment in ("src/timmy/agent.py", "coverage"):
            assert fragment in hypothesis

    def test_generate_hypothesis_skips_comment_lines(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="mymodule.py")
        program_text = "# comment\nActual direction here"
        assert "Actual direction" in experiment.generate_hypothesis(program_text)

    # ── evaluation ───────────────────────────────────────────────────────────

    def test_evaluate_baseline(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="unit_pass_rate")
        verdict = experiment.evaluate(85.0, None)
        assert "Baseline" in verdict
        assert "85" in verdict

    def test_evaluate_improvement_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="unit_pass_rate")
        assert "Improvement" in experiment.evaluate(90.0, 85.0)

    def test_evaluate_regression_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="coverage")
        assert "Regression" in experiment.evaluate(80.0, 85.0)

    def test_evaluate_none_metric(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py")
        assert "Indeterminate" in experiment.evaluate(None, 80.0)

    def test_evaluate_lower_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="val_bpb")
        assert "Improvement" in experiment.evaluate(1.1, 1.2)

    def test_is_improvement_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="unit_pass_rate")
        better = experiment.is_improvement(90.0, 85.0)
        worse = experiment.is_improvement(80.0, 85.0)
        assert better is True
        assert worse is False

    def test_is_improvement_lower_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="val_bpb")
        better = experiment.is_improvement(1.1, 1.2)
        worse = experiment.is_improvement(1.3, 1.2)
        assert better is True
        assert worse is False

    # ── subprocess-backed phases (subprocess.run is always mocked) ───────────

    def test_run_tox_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        finished = MagicMock(returncode=0, stdout="8 passed in 1.23s", stderr="")
        with patch("timmy.autoresearch.subprocess.run", return_value=finished):
            outcome = experiment.run_tox(tox_env="unit")

        assert outcome["success"] is True
        assert outcome["metric"] == pytest.approx(100.0)

    def test_run_tox_timeout(self, tmp_path):
        import subprocess

        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", budget_minutes=1, workspace=tmp_path)
        timeout = subprocess.TimeoutExpired(cmd="tox", timeout=60)
        with patch("timmy.autoresearch.subprocess.run", side_effect=timeout):
            outcome = experiment.run_tox()

        assert outcome["success"] is False
        assert "Budget exceeded" in outcome["error"]

    def test_apply_edit_aider_not_installed(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        missing = FileNotFoundError("aider not found")
        with patch("timmy.autoresearch.subprocess.run", side_effect=missing):
            status = experiment.apply_edit("some hypothesis")

        assert "not available" in status

    def test_commit_changes_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        finished = MagicMock(returncode=0)
        with patch("timmy.autoresearch.subprocess.run", return_value=finished):
            committed = experiment.commit_changes("test commit")

        assert committed is True

    def test_revert_changes_failure(self, tmp_path):
        import subprocess

        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        git_failure = subprocess.CalledProcessError(1, "git")
        with patch("timmy.autoresearch.subprocess.run", side_effect=git_failure):
            reverted = experiment.revert_changes()

        assert reverted is False
"""Tests for the `timmy learn` CLI command (autoresearch entry point)."""

from unittest.mock import MagicMock, patch

from typer.testing import CliRunner

from timmy.cli import app

runner = CliRunner()


def _dry_run_args(target: str, program_path, max_experiments: int) -> list[str]:
    """Build the argv for a ``timmy learn --dry-run`` invocation."""
    return [
        "learn",
        "--target",
        target,
        "--program",
        str(program_path),
        "--max-experiments",
        str(max_experiments),
        "--dry-run",
    ]


class TestLearnCommand:
    """Tests for `timmy learn`."""

    def test_requires_target(self):
        """Omitting --target fails and the message names the missing option."""
        result = runner.invoke(app, ["learn"])
        assert result.exit_code != 0
        assert "target" in result.output.lower() or "target" in (result.stderr or "").lower()

    def test_dry_run_shows_hypothesis_no_tox(self, tmp_path):
        """A dry run prints the hypothesis and never spawns a subprocess."""
        program_file = tmp_path / "program.md"
        program_file.write_text("Improve logging coverage in agent module")

        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            result = runner.invoke(app, _dry_run_args("src/timmy/agent.py", program_file, 2))

        assert result.exit_code == 0
        # tox should never be called in dry-run
        mock_run.assert_not_called()
        assert "agent.py" in result.output

    def test_missing_program_md_warns_but_continues(self, tmp_path):
        """A missing program file produces a note but does not abort the run."""
        missing = tmp_path / "nonexistent.md"
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0, stdout="3 passed", stderr="")
            result = runner.invoke(app, _dry_run_args("src/timmy/agent.py", missing, 1))

        assert result.exit_code == 0
        # The "warns" half of the contract: the note is written to stderr,
        # which the default CliRunner folds into ``result.output``.
        assert "not found" in result.output
        # The "continues" half: the loop still reached its first experiment.
        assert "[1/1]" in result.output
        mock_run.assert_not_called()

    def test_dry_run_prints_max_experiments_hypotheses(self, tmp_path):
        """A dry run prints exactly one header per requested experiment."""
        program_file = tmp_path / "program.md"
        program_file.write_text("Fix edge case in parser")

        result = runner.invoke(app, _dry_run_args("src/timmy/parser.py", program_file, 3))

        assert result.exit_code == 0
        # Should show 3 experiment headers
        for header in ("[1/3]", "[2/3]", "[3/3]"):
            assert result.output.count(header) == 1

    def test_help_text_present(self):
        """`learn --help` documents every user-facing option."""
        result = runner.invoke(app, ["learn", "--help"])
        assert result.exit_code == 0
        for option in ("--target", "--metric", "--budget", "--max-experiments", "--dry-run"):
            assert option in result.output