[claude] Add timmy learn autoresearch entry point (#907) (#1240)

Co-authored-by: Claude (Opus 4.6) <claude@hermes.local> Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
2026-03-23 23:14:09 +00:00
parent ea76af068a
commit 6691f4d1f3
5 changed files with 653 additions and 0 deletions
--- a/program.md
+++ b/program.md
@@ -0,0 +1,23 @@
 # Research Direction
 This file guides the `timmy learn` autoresearch loop.  Edit it to focus
 autonomous experiments on a specific goal.
 ## Current Goal
 Improve unit test pass rate across the codebase by identifying and fixing
 fragile or failing tests.
 ## Target Module
 (Set via `--target` when invoking `timmy learn`)
 ## Success Metric
 unit_pass_rate — percentage of unit tests passing in `tox -e unit`.
 ## Notes
 - Experiments run one at a time; within each, the edit step and the tox run are each time-boxed by `--budget` (minutes).
 - Improvements are committed automatically; regressions and unchanged results are reverted.
 - Use `--dry-run` to preview hypotheses without making changes.
--- a/src/timmy/autoresearch.py
+++ b/src/timmy/autoresearch.py
@@ -212,3 +212,212 @@ def _append_result(workspace: Path, result: dict[str, Any]) -> None:
    results_file.parent.mkdir(parents=True, exist_ok=True)
    with results_file.open("a") as f:
        f.write(json.dumps(result) + "\n")
 def _extract_pass_rate(output: str) -> float | None:
    """Extract pytest pass rate as a percentage from tox/pytest output."""
    passed_m = re.search(r"(\d+) passed", output)
    failed_m = re.search(r"(\d+) failed", output)
    if passed_m:
        passed = int(passed_m.group(1))
        failed = int(failed_m.group(1)) if failed_m else 0
        total = passed + failed
        return (passed / total * 100.0) if total > 0 else 100.0
    return None
 def _extract_coverage(output: str) -> float | None:
    """Extract total coverage percentage from coverage output."""
    coverage_m = re.search(r"(?:TOTAL\s+\d+\s+\d+\s+|Total coverage:\s*)(\d+)%", output)
    if coverage_m:
        try:
            return float(coverage_m.group(1))
        except ValueError:
            pass
    return None
 # ── Higher-is-better metric names ────────────────────────────────────────────
 _HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"})
 class SystemExperiment:
    """An autoresearch experiment targeting a specific module with a configurable metric.
    Encapsulates the hypothesis → edit → tox → evaluate → commit/revert loop
    for a single target file or module.
    Args:
        target: Path or module name to optimise (e.g. ``src/timmy/agent.py``).
        metric: Metric to extract from tox output.  Built-in values:
            ``unit_pass_rate`` (default), ``coverage``, ``val_bpb``.
            Any other value is forwarded to :func:`_extract_metric`.
        budget_minutes: Wall-clock budget per experiment (default 5 min).
        workspace: Working directory for subprocess calls.  Defaults to ``cwd``.
    """
    def __init__(
        self,
        target: str,
        metric: str = "unit_pass_rate",
        budget_minutes: int = 5,
        workspace: Path | None = None,
    ) -> None:
        self.target = target
        self.metric = metric
        self.budget_seconds = budget_minutes * 60
        self.workspace = Path(workspace) if workspace else Path.cwd()
    # ── Hypothesis generation ─────────────────────────────────────────────────
    def generate_hypothesis(self, program_content: str = "") -> str:
        """Return a plain-English hypothesis for the next experiment.
        Uses the first non-empty line of *program_content* when available;
        falls back to a generic description based on target and metric.
        """
        first_line = ""
        for line in program_content.splitlines():
            stripped = line.strip()
            if stripped and not stripped.startswith("#"):
                first_line = stripped[:120]
                break
        if first_line:
            return f"[{self.target}] {first_line}"
        return f"Improve {self.metric} for {self.target}"
    # ── Edit phase ────────────────────────────────────────────────────────────
    def apply_edit(self, hypothesis: str, model: str = "qwen3:30b") -> str:
        """Apply code edits to *target* via Aider.
        Returns a status string.  Degrades gracefully — never raises.
        """
        prompt = f"Edit {self.target}: {hypothesis}"
        try:
            result = subprocess.run(
                ["aider", "--no-git", "--model", f"ollama/{model}", "--quiet", prompt],
                capture_output=True,
                text=True,
                timeout=self.budget_seconds,
                cwd=str(self.workspace),
            )
            if result.returncode == 0:
                return result.stdout or "Edit applied."
            return f"Aider error (exit {result.returncode}): {result.stderr[:500]}"
        except FileNotFoundError:
            logger.warning("Aider not installed — edit skipped")
            return "Aider not available — edit skipped"
        except subprocess.TimeoutExpired:
            logger.warning("Aider timed out after %ds", self.budget_seconds)
            return "Aider timed out"
        except (OSError, subprocess.SubprocessError) as exc:
            logger.warning("Aider failed: %s", exc)
            return f"Edit failed: {exc}"
    # ── Evaluation phase ──────────────────────────────────────────────────────
    def run_tox(self, tox_env: str = "unit") -> dict[str, Any]:
        """Run *tox_env* and return a result dict.
        Returns:
            Dict with keys: ``metric`` (float|None), ``log`` (str),
            ``duration_s`` (int), ``success`` (bool), ``error`` (str|None).
        """
        start = time.monotonic()
        try:
            result = subprocess.run(
                ["tox", "-e", tox_env],
                capture_output=True,
                text=True,
                timeout=self.budget_seconds,
                cwd=str(self.workspace),
            )
            duration = int(time.monotonic() - start)
            output = result.stdout + result.stderr
            metric_val = self._extract_tox_metric(output)
            return {
                "metric": metric_val,
                "log": output[-3000:],
                "duration_s": duration,
                "success": result.returncode == 0,
                "error": None if result.returncode == 0 else f"Exit code {result.returncode}",
            }
        except subprocess.TimeoutExpired:
            duration = int(time.monotonic() - start)
            return {
                "metric": None,
                "log": f"Budget exceeded after {self.budget_seconds}s",
                "duration_s": duration,
                "success": False,
                "error": f"Budget exceeded after {self.budget_seconds}s",
            }
        except OSError as exc:
            return {"metric": None, "log": "", "duration_s": 0, "success": False, "error": str(exc)}
    def _extract_tox_metric(self, output: str) -> float | None:
        """Dispatch to the correct metric extractor based on *self.metric*."""
        if self.metric == "unit_pass_rate":
            return _extract_pass_rate(output)
        if self.metric == "coverage":
            return _extract_coverage(output)
        return _extract_metric(output, self.metric)
    def evaluate(self, current: float | None, baseline: float | None) -> str:
        """Compare *current* metric against *baseline* and return an assessment."""
        if current is None:
            return "Indeterminate: metric not extracted from output"
        if baseline is None:
            unit = "%" if self.metric in _HIGHER_IS_BETTER else ""
            return f"Baseline: {self.metric} = {current:.2f}{unit}"
        if self.metric in _HIGHER_IS_BETTER:
            delta = current - baseline
            pct = (delta / baseline * 100) if baseline != 0 else 0.0
            if delta > 0:
                return f"Improvement: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
            if delta < 0:
                return f"Regression: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
            return f"No change: {self.metric} = {current:.2f}%"
        # lower-is-better (val_bpb, loss, etc.)
        return evaluate_result(current, baseline, self.metric)
    def is_improvement(self, current: float, baseline: float) -> bool:
        """Return True if *current* is better than *baseline* for this metric."""
        if self.metric in _HIGHER_IS_BETTER:
            return current > baseline
        return current < baseline  # lower-is-better
    # ── Git phase ─────────────────────────────────────────────────────────────
    def commit_changes(self, message: str) -> bool:
        """Stage and commit all changes.  Returns True on success."""
        try:
            subprocess.run(
                ["git", "add", "-A"], cwd=str(self.workspace), check=True, timeout=30
            )
            subprocess.run(
                ["git", "commit", "-m", message],
                cwd=str(self.workspace),
                check=True,
                timeout=30,
            )
            return True
        except subprocess.CalledProcessError as exc:
            logger.warning("Git commit failed: %s", exc)
            return False
    def revert_changes(self) -> bool:
        """Revert all uncommitted changes.  Returns True on success."""
        try:
            subprocess.run(
                ["git", "checkout", "--", "."],
                cwd=str(self.workspace),
                check=True,
                timeout=30,
            )
            return True
        except subprocess.CalledProcessError as exc:
            logger.warning("Git revert failed: %s", exc)
            return False
--- a/src/timmy/cli.py
+++ b/src/timmy/cli.py
@@ -527,5 +527,159 @@ def healthcheck(
    raise typer.Exit(result.returncode)
@app.command()
 def learn(
    target: str | None = typer.Option(
        None,
        "--target",
        "-t",
        help="Module or file to optimise (e.g. 'src/timmy/agent.py')",
    ),
    metric: str = typer.Option(
        "unit_pass_rate",
        "--metric",
        "-m",
        help="Metric to track: unit_pass_rate | coverage | val_bpb | <custom>",
    ),
    budget: int = typer.Option(
        5,
        "--budget",
        help="Time limit per experiment in minutes",
    ),
    max_experiments: int = typer.Option(
        10,
        "--max-experiments",
        help="Cap on total experiments per run",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Show hypothesis without executing experiments",
    ),
    program_file: str | None = typer.Option(
        None,
        "--program",
        "-p",
        help="Path to research direction file (default: program.md in cwd)",
    ),
    tox_env: str = typer.Option(
        "unit",
        "--tox-env",
        help="Tox environment to run for each evaluation",
    ),
    model: str = typer.Option(
        "qwen3:30b",
        "--model",
        help="Ollama model forwarded to Aider for code edits",
    ),
 ):
    """Start an autonomous improvement loop (autoresearch).
    Reads program.md for research direction, then iterates:
    hypothesis → edit → tox → evaluate → commit/revert.
    Experiments continue until --max-experiments is reached or the loop is
    interrupted with Ctrl+C.  Use --dry-run to preview hypotheses without
    making any changes.
    Example:
        timmy learn --target src/timmy/agent.py --metric unit_pass_rate
    """
    from pathlib import Path
    from timmy.autoresearch import SystemExperiment
    repo_root = Path.cwd()
    program_path = Path(program_file) if program_file else repo_root / "program.md"
    if program_path.exists():
        program_content = program_path.read_text()
        typer.echo(f"Research direction: {program_path}")
    else:
        program_content = ""
        typer.echo(
            f"Note: {program_path} not found — proceeding without research direction.",
            err=True,
        )
    if target is None:
        typer.echo(
            "Error: --target is required. Specify the module or file to optimise.", err=True
        )
        raise typer.Exit(1)
    experiment = SystemExperiment(
        target=target,
        metric=metric,
        budget_minutes=budget,
    )
    typer.echo()
    typer.echo(typer.style("Autoresearch", bold=True) + f" — {target}")
    typer.echo(f"  metric={metric}  budget={budget}min  max={max_experiments}  tox={tox_env}")
    if dry_run:
        typer.echo("  (dry-run — no changes will be made)")
    typer.echo()
    baseline: float | None = None
    try:
        for i in range(1, max_experiments + 1):
            typer.echo(typer.style(f"[{i}/{max_experiments}]", bold=True), nl=False)
            hypothesis = experiment.generate_hypothesis(program_content)
            typer.echo(f" {hypothesis}")
            if dry_run:
                continue
            # Edit phase
            typer.echo("  → editing …", nl=False)
            edit_result = experiment.apply_edit(hypothesis, model=model)
            if "not available" in edit_result or edit_result.startswith("Aider error"):
                typer.echo(f" skipped ({edit_result.split(':')[0]})")
            else:
                typer.echo(" done")
            # Evaluate phase
            typer.echo("  → running tox …", nl=False)
            tox_result = experiment.run_tox(tox_env=tox_env)
            typer.echo(f" {tox_result['duration_s']}s")
            assessment = experiment.evaluate(tox_result["metric"], baseline)
            typer.echo(f"  → {assessment}")
            if tox_result["metric"] is not None and baseline is None:
                baseline = tox_result["metric"]
            if tox_result["success"] and tox_result["metric"] is not None and baseline is not None:
                if experiment.is_improvement(tox_result["metric"], baseline):
                    commit_msg = (
                        f"autoresearch: improve {metric} on {target} — {assessment}"
                    )
                    if experiment.commit_changes(commit_msg):
                        typer.echo("  → committed")
                        baseline = tox_result["metric"]
                    else:
                        experiment.revert_changes()
                        typer.echo("  → commit failed, reverted")
                else:
                    experiment.revert_changes()
                    typer.echo("  → reverted (no improvement)")
            elif not tox_result["success"]:
                experiment.revert_changes()
                typer.echo(f"  → reverted ({tox_result['error']})")
            typer.echo()
    except KeyboardInterrupt:
        typer.echo("\nInterrupted.")
        raise typer.Exit(0) from None
    typer.echo(typer.style("Autoresearch complete.", bold=True))
    if baseline is not None:
        typer.echo(f"Final {metric}: {baseline:.4f}")
 def main():
    app()
--- a/tests/timmy/test_autoresearch.py
+++ b/tests/timmy/test_autoresearch.py
@@ -176,3 +176,176 @@ class TestExtractMetric:
        output = "loss: 0.45\nloss: 0.32"
        assert _extract_metric(output, "loss") == pytest.approx(0.32)
 class TestExtractPassRate:
    """Checks _extract_pass_rate() against typical pytest summary lines."""

    def test_all_passing(self):
        # Only passes reported -> 100 %.
        from timmy.autoresearch import _extract_pass_rate

        rate = _extract_pass_rate("5 passed in 1.23s")
        assert rate == pytest.approx(100.0)

    def test_mixed_results(self):
        # 8 of 10 tests passed -> 80 %.
        from timmy.autoresearch import _extract_pass_rate

        summary = "8 passed, 2 failed in 2.00s"
        rate = _extract_pass_rate(summary)
        assert rate == pytest.approx(80.0)

    def test_no_pytest_output(self):
        # Text without a pytest summary yields no metric.
        from timmy.autoresearch import _extract_pass_rate

        rate = _extract_pass_rate("no test results here")
        assert rate is None
 class TestExtractCoverage:
    """Checks _extract_coverage() against coverage report text."""

    def test_total_line(self):
        # The TOTAL row of a coverage report carries the overall percentage.
        from timmy.autoresearch import _extract_coverage

        percent = _extract_coverage("TOTAL    1234    100    92%")
        assert percent == pytest.approx(92.0)

    def test_no_coverage(self):
        # Text without coverage information yields no metric.
        from timmy.autoresearch import _extract_coverage

        percent = _extract_coverage("no coverage data")
        assert percent is None
 class TestSystemExperiment:
    """Unit tests for SystemExperiment; every subprocess call is patched out."""

    # ── generate_hypothesis ───────────────────────────────────────────────────
    def test_generate_hypothesis_with_program(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="src/timmy/agent.py")
        hypothesis = experiment.generate_hypothesis("Fix memory leak in session handling")
        assert "src/timmy/agent.py" in hypothesis
        assert "Fix memory leak" in hypothesis

    def test_generate_hypothesis_fallback(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="src/timmy/agent.py", metric="coverage")
        hypothesis = experiment.generate_hypothesis("")
        assert "src/timmy/agent.py" in hypothesis
        assert "coverage" in hypothesis

    def test_generate_hypothesis_skips_comment_lines(self):
        from timmy.autoresearch import SystemExperiment

        program = "# comment\nActual direction here"
        hypothesis = SystemExperiment(target="mymodule.py").generate_hypothesis(program)
        assert "Actual direction" in hypothesis

    # ── evaluate ──────────────────────────────────────────────────────────────
    def test_evaluate_baseline(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="unit_pass_rate")
        verdict = experiment.evaluate(85.0, None)
        assert "Baseline" in verdict
        assert "85" in verdict

    def test_evaluate_improvement_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="unit_pass_rate")
        verdict = experiment.evaluate(90.0, 85.0)
        assert "Improvement" in verdict

    def test_evaluate_regression_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="coverage")
        verdict = experiment.evaluate(80.0, 85.0)
        assert "Regression" in verdict

    def test_evaluate_none_metric(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py")
        verdict = experiment.evaluate(None, 80.0)
        assert "Indeterminate" in verdict

    def test_evaluate_lower_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="val_bpb")
        verdict = experiment.evaluate(1.1, 1.2)
        assert "Improvement" in verdict

    # ── is_improvement ────────────────────────────────────────────────────────
    def test_is_improvement_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="unit_pass_rate")
        better = experiment.is_improvement(90.0, 85.0)
        worse = experiment.is_improvement(80.0, 85.0)
        assert better is True
        assert worse is False

    def test_is_improvement_lower_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="val_bpb")
        better = experiment.is_improvement(1.1, 1.2)
        worse = experiment.is_improvement(1.3, 1.2)
        assert better is True
        assert worse is False

    # ── run_tox ───────────────────────────────────────────────────────────────
    def test_run_tox_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        completed = MagicMock(returncode=0, stdout="8 passed in 1.23s", stderr="")
        with patch("timmy.autoresearch.subprocess.run", return_value=completed):
            outcome = experiment.run_tox(tox_env="unit")
        assert outcome["success"] is True
        assert outcome["metric"] == pytest.approx(100.0)

    def test_run_tox_timeout(self, tmp_path):
        import subprocess

        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", budget_minutes=1, workspace=tmp_path)
        timeout = subprocess.TimeoutExpired(cmd="tox", timeout=60)
        with patch("timmy.autoresearch.subprocess.run", side_effect=timeout):
            outcome = experiment.run_tox()
        assert outcome["success"] is False
        assert "Budget exceeded" in outcome["error"]

    # ── apply_edit ────────────────────────────────────────────────────────────
    def test_apply_edit_aider_not_installed(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        missing = FileNotFoundError("aider not found")
        with patch("timmy.autoresearch.subprocess.run", side_effect=missing):
            status = experiment.apply_edit("some hypothesis")
        assert "not available" in status

    # ── git helpers ───────────────────────────────────────────────────────────
    def test_commit_changes_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        git_ok = MagicMock(returncode=0)
        with patch("timmy.autoresearch.subprocess.run", return_value=git_ok):
            committed = experiment.commit_changes("test commit")
        assert committed is True

    def test_revert_changes_failure(self, tmp_path):
        import subprocess

        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        git_error = subprocess.CalledProcessError(1, "git")
        with patch("timmy.autoresearch.subprocess.run", side_effect=git_error):
            reverted = experiment.revert_changes()
        assert reverted is False
--- a/tests/timmy/test_cli_learn.py
+++ b/tests/timmy/test_cli_learn.py
@@ -0,0 +1,94 @@
 """Tests for the `timmy learn` CLI command (autoresearch entry point)."""
 from unittest.mock import MagicMock, patch
 from typer.testing import CliRunner
 from timmy.cli import app
 # Shared CLI test runner; every test below invokes `app` through it.
 runner = CliRunner()
 class TestLearnCommand:
    """Tests for `timmy learn`.

    All invocations either omit ``--target`` or pass ``--dry-run``, so no
    Aider, tox or git subprocess is expected to run.
    """

    def test_requires_target(self):
        """Without --target the command exits non-zero and names the option."""
        result = runner.invoke(app, ["learn"])
        assert result.exit_code != 0
        # `output` is enough here; reading `result.stderr` can raise when the
        # runner did not capture stderr separately.
        assert "target" in result.output.lower()

    def test_dry_run_shows_hypothesis_no_tox(self, tmp_path):
        """--dry-run prints the hypothesis and never spawns a subprocess."""
        program_file = tmp_path / "program.md"
        program_file.write_text("Improve logging coverage in agent module")
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            result = runner.invoke(
                app,
                [
                    "learn",
                    "--target",
                    "src/timmy/agent.py",
                    "--program",
                    str(program_file),
                    "--max-experiments",
                    "2",
                    "--dry-run",
                ],
            )
        assert result.exit_code == 0
        # tox should never be called in dry-run
        mock_run.assert_not_called()
        assert "agent.py" in result.output

    def test_missing_program_md_warns_but_continues(self, tmp_path):
        """A missing program file produces a warning, not a failure."""
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0, stdout="3 passed", stderr="")
            result = runner.invoke(
                app,
                [
                    "learn",
                    "--target",
                    "src/timmy/agent.py",
                    "--program",
                    str(tmp_path / "nonexistent.md"),
                    "--max-experiments",
                    "1",
                    "--dry-run",
                ],
            )
        assert result.exit_code == 0
        # The "warns" half of the contract: the note about the missing file
        # must actually be emitted.
        assert "not found" in result.output
        # Still a dry run, so nothing may have been executed.
        mock_run.assert_not_called()

    def test_dry_run_prints_max_experiments_hypotheses(self, tmp_path):
        """Each of the --max-experiments iterations prints exactly one header."""
        program_file = tmp_path / "program.md"
        program_file.write_text("Fix edge case in parser")
        result = runner.invoke(
            app,
            [
                "learn",
                "--target",
                "src/timmy/parser.py",
                "--program",
                str(program_file),
                "--max-experiments",
                "3",
                "--dry-run",
            ],
        )
        assert result.exit_code == 0
        # Should show 3 experiment headers
        for header in ("[1/3]", "[2/3]", "[3/3]"):
            assert result.output.count(header) == 1

    def test_help_text_present(self):
        """--help lists the main options of the command."""
        result = runner.invoke(app, ["learn", "--help"])
        assert result.exit_code == 0
        for option in ("--target", "--metric", "--budget", "--max-experiments", "--dry-run"):
            assert option in result.output