From 6691f4d1f31b06b046b895ad5d12b3bfb65bd805 Mon Sep 17 00:00:00 2001
From: "Claude (Opus 4.6)" <claude@hermes.local>
Date: Mon, 23 Mar 2026 23:14:09 +0000
Subject: [PATCH] [claude] Add timmy learn autoresearch entry point (#907)
 (#1240)

Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
---
 program.md                       |  23 ++++
 src/timmy/autoresearch.py        | 209 +++++++++++++++++++++++++++++++
 src/timmy/cli.py                 | 154 +++++++++++++++++++++++
 tests/timmy/test_autoresearch.py | 173 +++++++++++++++++++++++++
 tests/timmy/test_cli_learn.py    |  94 ++++++++++++++
 5 files changed, 653 insertions(+)
 create mode 100644 program.md
 create mode 100644 tests/timmy/test_cli_learn.py

diff --git a/program.md b/program.md
new file mode 100644
index 00000000..9edab275
--- /dev/null
+++ b/program.md
@@ -0,0 +1,23 @@
+# Research Direction
+
+This file guides the `timmy learn` autoresearch loop.  Edit it to focus
+autonomous experiments on a specific goal.
+
+## Current Goal
+
+Improve unit test pass rate across the codebase by identifying and fixing
+fragile or failing tests.
+
+## Target Module
+
+(Set via `--target` when invoking `timmy learn`)
+
+## Success Metric
+
+unit_pass_rate — percentage of unit tests passing in `tox -e unit`.
+
+## Notes
+
+- Experiments run one at a time; each is time-boxed by `--budget`.
+- Improvements are committed automatically; regressions are reverted.
+- Use `--dry-run` to preview hypotheses without making changes.
diff --git a/src/timmy/autoresearch.py b/src/timmy/autoresearch.py
index 14fbea74..ede95927 100644
--- a/src/timmy/autoresearch.py
+++ b/src/timmy/autoresearch.py
@@ -212,3 +212,212 @@ def _append_result(workspace: Path, result: dict[str, Any]) -> None:
     results_file.parent.mkdir(parents=True, exist_ok=True)
     with results_file.open("a") as f:
         f.write(json.dumps(result) + "\n")
+
+
+def _extract_pass_rate(output: str) -> float | None:
+    """Extract pytest pass rate as a percentage; failures and errors both count against it."""
+    # An all-failing summary has no "N passed" token and must yield 0.0 rather than None.
+    outcomes = ("passed", "failed", "error")
+    found = {o: int(m.group(1)) for o in outcomes if (m := re.search(rf"(\d+) {o}", output))}
+    if not found:
+        return None
+    total = sum(found.values())
+    passed = found.get("passed", 0)
+    return (passed / total * 100.0) if total > 0 else 100.0
+
+
+def _extract_coverage(output: str) -> float | None:
+    """Extract total coverage percentage from coverage output.
+
+    Accepts the ``TOTAL`` row of a coverage report with any number of numeric
+    columns (so branch-coverage reports work) or a ``Total coverage: N%`` line;
+    fractional percentages such as ``92.5%`` are supported.  Returns None if absent.
+    """
+    coverage_m = re.search(r"(?:TOTAL(?:\s+\d+)+\s+|Total coverage:\s*)(\d+(?:\.\d+)?)%", output)
+    return float(coverage_m.group(1)) if coverage_m else None
+
+
+# ── Higher-is-better metric names (any other metric is treated as lower-is-better) ──
+_HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"})
+
+
+class SystemExperiment:
+    """An autoresearch experiment targeting a specific module with a configurable metric.
+
+    Encapsulates the hypothesis → edit → tox → evaluate → commit/revert loop
+    for a single target file or module.
+
+    Args:
+        target: Path or module name to optimise (e.g. ``src/timmy/agent.py``).
+        metric: Metric to extract from tox output.  Built-in values:
+            ``unit_pass_rate`` (default), ``coverage``, ``val_bpb``.
+            Any other value is forwarded to :func:`_extract_metric`.
+        budget_minutes: Timeout in minutes for each Aider or tox call (default 5).
+        workspace: Working directory for subprocess calls.  Defaults to ``cwd``.
+    """
+
+    def __init__(
+        self,
+        target: str,
+        metric: str = "unit_pass_rate",
+        budget_minutes: int = 5,
+        workspace: Path | None = None,
+    ) -> None:
+        self.target = target
+        self.metric = metric
+        self.budget_seconds = budget_minutes * 60
+        self.workspace = Path(workspace) if workspace else Path.cwd()
+
+    # ── Hypothesis generation ─────────────────────────────────────────────────
+
+    def generate_hypothesis(self, program_content: str = "") -> str:
+        """Return a plain-English hypothesis for the next experiment.
+
+        Prefers the first text line under a ``Current Goal`` heading in
+        *program_content*, then the first non-empty non-heading line, and
+        finally a generic description based on target and metric.
+        """
+        first_line = goal_line = ""
+        in_goal = False
+        for line in program_content.splitlines():
+            stripped = line.strip()
+            if stripped.startswith("#"):
+                # A heading opens or closes the goal section and is never itself the hypothesis.
+                in_goal = "current goal" in stripped.lower()
+            elif stripped:
+                first_line = first_line or stripped[:120]
+                if in_goal:
+                    goal_line = stripped[:120]
+                    break
+        direction = goal_line or first_line
+        if direction:
+            return f"[{self.target}] {direction}"
+        return f"Improve {self.metric} for {self.target}"
+
+    # ── Edit phase ────────────────────────────────────────────────────────────
+
+    def apply_edit(self, hypothesis: str, model: str = "qwen3:30b") -> str:
+        """Apply code edits to *target* via Aider.
+
+        Returns a status string.  Degrades gracefully — never raises.
+        """
+        prompt = f"Edit {self.target}: {hypothesis}"
+        # Aider reads positional arguments as files to edit, so the prompt goes via --message.
+        cmd = ["aider", "--no-git", "--model", f"ollama/{model}", "--quiet", "--message", prompt]
+        if (self.workspace / self.target).is_file():
+            cmd.append(self.target)  # hand Aider the file only when the target is a real path
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=self.budget_seconds,
+                cwd=str(self.workspace),
+            )
+            if result.returncode == 0:
+                return result.stdout or "Edit applied."
+            return f"Aider error (exit {result.returncode}): {result.stderr[:500]}"
+        except FileNotFoundError:
+            logger.warning("Aider not installed — edit skipped")
+            return "Aider not available — edit skipped"
+        except subprocess.TimeoutExpired:
+            logger.warning("Aider timed out after %ds", self.budget_seconds)
+            return "Aider timed out"
+        except (OSError, subprocess.SubprocessError) as exc:
+            logger.warning("Aider failed: %s", exc)
+            return f"Edit failed: {exc}"
+
+    # ── Evaluation phase ──────────────────────────────────────────────────────
+
+    def run_tox(self, tox_env: str = "unit") -> dict[str, Any]:
+        """Run *tox_env* and return a result dict.
+
+        Returns:
+            Dict with keys: ``metric`` (float|None), ``log`` (str),
+            ``duration_s`` (int), ``success`` (bool), ``error`` (str|None).
+        """
+        start = time.monotonic()
+        try:
+            result = subprocess.run(
+                ["tox", "-e", tox_env],
+                capture_output=True,
+                text=True,
+                timeout=self.budget_seconds,
+                cwd=str(self.workspace),
+            )
+        except subprocess.TimeoutExpired:
+            msg = f"Budget exceeded after {self.budget_seconds}s"
+            took = int(time.monotonic() - start)
+            return {"metric": None, "log": msg, "duration_s": took, "success": False, "error": msg}
+        except OSError as exc:
+            return {"metric": None, "log": "", "duration_s": 0, "success": False, "error": str(exc)}
+        output = result.stdout + result.stderr
+        ok = result.returncode == 0
+        return {
+            "metric": self._extract_tox_metric(output),
+            "log": output[-3000:],  # keep only the tail of the combined output
+            "duration_s": int(time.monotonic() - start),
+            "success": ok,
+            "error": None if ok else f"Exit code {result.returncode}",
+        }
+
+    def _extract_tox_metric(self, output: str) -> float | None:
+        """Dispatch to the correct metric extractor based on *self.metric*."""
+        if self.metric == "unit_pass_rate":
+            return _extract_pass_rate(output)
+        if self.metric == "coverage":
+            return _extract_coverage(output)
+        return _extract_metric(output, self.metric)
+
+    def evaluate(self, current: float | None, baseline: float | None) -> str:
+        """Compare *current* metric against *baseline* and return an assessment."""
+        if current is None:
+            return "Indeterminate: metric not extracted from output"
+        if baseline is None:
+            unit = "%" if self.metric in _HIGHER_IS_BETTER else ""
+            return f"Baseline: {self.metric} = {current:.2f}{unit}"
+
+        if self.metric in _HIGHER_IS_BETTER:
+            delta = current - baseline
+            pct = (delta / baseline * 100) if baseline != 0 else 0.0
+            if delta > 0:
+                return f"Improvement: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
+            if delta < 0:
+                return f"Regression: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
+            return f"No change: {self.metric} = {current:.2f}%"
+
+        # lower-is-better (val_bpb, loss, etc.)
+        return evaluate_result(current, baseline, self.metric)
+
+    def is_improvement(self, current: float, baseline: float) -> bool:
+        """Return True if *current* is better than *baseline* for this metric."""
+        if self.metric in _HIGHER_IS_BETTER:
+            return current > baseline
+        return current < baseline  # lower-is-better
+
+    # ── Git phase ─────────────────────────────────────────────────────────────
+
+    def commit_changes(self, message: str) -> bool:
+        """Stage and commit all changes.  Returns True on success, False on any git failure."""
+        cwd = str(self.workspace)
+        try:
+            subprocess.run(["git", "add", "-A"], cwd=cwd, check=True, timeout=30)
+            subprocess.run(["git", "commit", "-m", message], cwd=cwd, check=True, timeout=30)
+            return True
+        except (subprocess.SubprocessError, OSError) as exc:
+            # Covers non-zero exits, timeouts, and a missing git binary alike.
+            logger.warning("Git commit failed: %s", exc)
+            return False
+
+    def revert_changes(self) -> bool:
+        """Revert uncommitted changes to tracked files.  Returns True on success.
+
+        Only tracked files are restored; files newly created by an edit are left in place.
+        """
+        cwd = str(self.workspace)
+        try:
+            subprocess.run(["git", "checkout", "--", "."], cwd=cwd, check=True, timeout=30)
+            return True
+        except (subprocess.SubprocessError, OSError) as exc:
+            logger.warning("Git revert failed: %s", exc)
+            return False
diff --git a/src/timmy/cli.py b/src/timmy/cli.py
index ef65dbfe..f82e99ef 100644
--- a/src/timmy/cli.py
+++ b/src/timmy/cli.py
@@ -527,5 +527,159 @@ def healthcheck(
     raise typer.Exit(result.returncode)
 
 
+@app.command()
+def learn(
+    target: str | None = typer.Option(
+        None,
+        "--target",
+        "-t",
+        help="Module or file to optimise (e.g. 'src/timmy/agent.py')",
+    ),
+    metric: str = typer.Option(
+        "unit_pass_rate",
+        "--metric",
+        "-m",
+        help="Metric to track: unit_pass_rate | coverage | val_bpb | <custom>",
+    ),
+    budget: int = typer.Option(
+        5,
+        "--budget",
+        min=1,
+        help="Time limit per experiment in minutes",
+    ),
+    max_experiments: int = typer.Option(
+        10,
+        "--max-experiments",
+        min=1,
+        help="Cap on total experiments per run",
+    ),
+    dry_run: bool = typer.Option(
+        False,
+        "--dry-run",
+        help="Show hypothesis without executing experiments",
+    ),
+    program_file: str | None = typer.Option(
+        None,
+        "--program",
+        "-p",
+        help="Path to research direction file (default: program.md in cwd)",
+    ),
+    tox_env: str = typer.Option(
+        "unit",
+        "--tox-env",
+        help="Tox environment to run for each evaluation",
+    ),
+    model: str = typer.Option(
+        "qwen3:30b",
+        "--model",
+        help="Ollama model forwarded to Aider for code edits",
+    ),
+):
+    """Start an autonomous improvement loop (autoresearch).
+
+    Reads program.md for research direction, measures a tox baseline, then
+    iterates: hypothesis → edit → tox → evaluate → commit/revert.
+
+    Experiments continue until --max-experiments is reached or the loop is
+    interrupted with Ctrl+C.  Use --dry-run to preview hypotheses without
+    making any changes.
+
+    Example:
+        timmy learn --target src/timmy/agent.py --metric unit_pass_rate
+    """
+    from pathlib import Path
+
+    from timmy.autoresearch import SystemExperiment
+
+    if target is None:
+        typer.echo(
+            "Error: --target is required. Specify the module or file to optimise.", err=True
+        )
+        raise typer.Exit(1)
+
+    program_path = Path(program_file) if program_file else Path.cwd() / "program.md"
+
+    if program_path.is_file():
+        program_content = program_path.read_text(encoding="utf-8")
+        typer.echo(f"Research direction: {program_path}")
+    else:
+        program_content = ""
+        typer.echo(
+            f"Note: {program_path} not found — proceeding without research direction.",
+            err=True,
+        )
+
+    experiment = SystemExperiment(target=target, metric=metric, budget_minutes=budget)
+
+    typer.echo()
+    typer.echo(typer.style("Autoresearch", bold=True) + f" — {target}")
+    typer.echo(f"  metric={metric}  budget={budget}min  max={max_experiments}  tox={tox_env}")
+    if dry_run:
+        typer.echo("  (dry-run — no changes will be made)")
+    typer.echo()
+
+    baseline: float | None = None
+    if not dry_run:
+        # Measure the untouched tree first so the first edit is judged against a true baseline.
+        typer.echo("Measuring baseline …", nl=False)
+        baseline = experiment.run_tox(tox_env=tox_env)["metric"]
+        typer.echo(f" {experiment.evaluate(baseline, None)}\n")
+
+    edit_failures = ("Aider error", "Aider not available", "Aider timed out", "Edit failed")
+    try:
+        for i in range(1, max_experiments + 1):
+            typer.echo(typer.style(f"[{i}/{max_experiments}]", bold=True), nl=False)
+            hypothesis = experiment.generate_hypothesis(program_content)
+            typer.echo(f" {hypothesis}")
+            if dry_run:
+                continue
+
+            # Edit phase
+            typer.echo("  → editing …", nl=False)
+            edit_result = experiment.apply_edit(hypothesis, model=model)
+            if edit_result.startswith(edit_failures):
+                # Nothing trustworthy to evaluate: drop any partial edit and skip the tox run.
+                experiment.revert_changes()
+                typer.echo(f" skipped ({edit_result.split(':')[0]})\n")
+                continue
+            typer.echo(" done")
+
+            # Evaluate phase
+            typer.echo("  → running tox …", nl=False)
+            tox_result = experiment.run_tox(tox_env=tox_env)
+            typer.echo(f" {tox_result['duration_s']}s")
+            current = tox_result["metric"]
+            assessment = experiment.evaluate(current, baseline)
+            typer.echo(f"  → {assessment}")
+            if baseline is None and current is not None:
+                baseline = current  # the baseline run yielded no metric
+            commit_msg = f"autoresearch: improve {metric} on {target} — {assessment}"
+            if not tox_result["success"]:
+                experiment.revert_changes()
+                typer.echo(f"  → reverted ({tox_result['error']})")
+            elif current is None:
+                # Without a metric the edit cannot be judged; never leave it in the tree.
+                experiment.revert_changes()
+                typer.echo("  → reverted (metric unavailable)")
+            elif not experiment.is_improvement(current, baseline):
+                experiment.revert_changes()
+                typer.echo("  → reverted (no improvement)")
+            elif experiment.commit_changes(commit_msg):
+                typer.echo("  → committed")
+                baseline = current
+            else:
+                experiment.revert_changes()
+                typer.echo("  → commit failed, reverted")
+            typer.echo()
+
+    except KeyboardInterrupt:
+        typer.echo("\nInterrupted.")
+        raise typer.Exit(0) from None
+
+    typer.echo(typer.style("Autoresearch complete.", bold=True))
+    if baseline is not None:
+        typer.echo(f"Final {metric}: {baseline:.4f}")
+
+
 def main():
     app()
diff --git a/tests/timmy/test_autoresearch.py b/tests/timmy/test_autoresearch.py
index c762d47f..e5143fda 100644
--- a/tests/timmy/test_autoresearch.py
+++ b/tests/timmy/test_autoresearch.py
@@ -176,3 +176,176 @@ class TestExtractMetric:
 
         output = "loss: 0.45\nloss: 0.32"
         assert _extract_metric(output, "loss") == pytest.approx(0.32)
+
+
+class TestExtractPassRate:
+    """Behaviour of _extract_pass_rate() on pytest summary lines."""
+
+    def test_all_passing(self):
+        from timmy.autoresearch import _extract_pass_rate as extract
+
+        rate = extract("5 passed in 1.23s")
+        assert rate == pytest.approx(100.0)
+
+    def test_mixed_results(self):
+        from timmy.autoresearch import _extract_pass_rate as extract
+
+        rate = extract("8 passed, 2 failed in 2.00s")
+        assert rate == pytest.approx(80.0)
+
+    def test_no_pytest_output(self):
+        from timmy.autoresearch import _extract_pass_rate as extract
+
+        assert extract("no test results here") is None
+
+
+class TestExtractCoverage:
+    """Behaviour of _extract_coverage() on coverage report text."""
+
+    def test_total_line(self):
+        from timmy.autoresearch import _extract_coverage as extract
+
+        percent = extract("TOTAL    1234    100    92%")
+        assert percent == pytest.approx(92.0)
+
+    def test_no_coverage(self):
+        from timmy.autoresearch import _extract_coverage as extract
+
+        assert extract("no coverage data") is None
+
+
+class TestSystemExperiment:
+    """Tests for SystemExperiment class (all subprocess calls are mocked)."""
+
+    def test_generate_hypothesis_with_program(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="src/timmy/agent.py")
+        hyp = exp.generate_hypothesis("Fix memory leak in session handling")
+        assert "src/timmy/agent.py" in hyp
+        assert "Fix memory leak" in hyp
+
+    def test_generate_hypothesis_fallback(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="src/timmy/agent.py", metric="coverage")
+        hyp = exp.generate_hypothesis("")
+        assert "src/timmy/agent.py" in hyp
+        assert "coverage" in hyp
+
+    def test_generate_hypothesis_skips_comment_lines(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="mymodule.py")
+        hyp = exp.generate_hypothesis("# comment\nActual direction here")
+        assert "Actual direction" in hyp
+
+    def test_evaluate_baseline(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
+        result = exp.evaluate(85.0, None)
+        assert "Baseline" in result
+        assert "85" in result
+
+    def test_evaluate_improvement_higher_is_better(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
+        result = exp.evaluate(90.0, 85.0)
+        assert "Improvement" in result
+
+    def test_evaluate_regression_higher_is_better(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", metric="coverage")
+        result = exp.evaluate(80.0, 85.0)
+        assert "Regression" in result
+
+    def test_evaluate_none_metric(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py")
+        result = exp.evaluate(None, 80.0)
+        assert "Indeterminate" in result
+
+    def test_evaluate_lower_is_better(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", metric="val_bpb")
+        result = exp.evaluate(1.1, 1.2)
+        assert "Improvement" in result
+
+    def test_is_improvement_higher_is_better(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
+        assert exp.is_improvement(90.0, 85.0) is True
+        assert exp.is_improvement(80.0, 85.0) is False
+
+    def test_is_improvement_lower_is_better(self):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", metric="val_bpb")
+        assert exp.is_improvement(1.1, 1.2) is True
+        assert exp.is_improvement(1.3, 1.2) is False
+
+    def test_run_tox_success(self, tmp_path):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", workspace=tmp_path)
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(returncode=0, stdout="8 passed in 1.23s", stderr="")
+            result = exp.run_tox(tox_env="unit")
+
+        assert result["success"] is True
+        assert result["error"] is None
+        assert result["metric"] == pytest.approx(100.0)
+
+    def test_run_tox_timeout(self, tmp_path):
+        import subprocess
+
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", budget_minutes=1, workspace=tmp_path)
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.side_effect = subprocess.TimeoutExpired(cmd="tox", timeout=60)
+            result = exp.run_tox()
+
+        assert result["success"] is False
+        assert result["metric"] is None
+        assert "Budget exceeded" in result["error"]
+
+    def test_apply_edit_aider_not_installed(self, tmp_path):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", workspace=tmp_path)
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.side_effect = FileNotFoundError("aider not found")
+            result = exp.apply_edit("some hypothesis")
+
+        assert "not available" in result
+
+    def test_commit_changes_success(self, tmp_path):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", workspace=tmp_path)
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(returncode=0)
+            success = exp.commit_changes("test commit")
+
+        assert success is True
+        assert mock_run.call_count == 2  # one call stages, the next one commits
+        assert mock_run.call_args_list[0].args[0] == ["git", "add", "-A"]
+
+    def test_revert_changes_failure(self, tmp_path):
+        import subprocess
+
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", workspace=tmp_path)
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.side_effect = subprocess.CalledProcessError(1, "git")
+            success = exp.revert_changes()
+
+        assert success is False
diff --git a/tests/timmy/test_cli_learn.py b/tests/timmy/test_cli_learn.py
new file mode 100644
index 00000000..f42002bf
--- /dev/null
+++ b/tests/timmy/test_cli_learn.py
@@ -0,0 +1,94 @@
+"""Tests for the `timmy learn` CLI command (autoresearch entry point)."""
+
+from unittest.mock import MagicMock, patch
+
+from typer.testing import CliRunner
+
+from timmy.cli import app
+
+runner = CliRunner()
+
+
+class TestLearnCommand:
+    """Tests for `timmy learn`."""
+
+    def test_requires_target(self):
+        result = runner.invoke(app, ["learn"])
+        assert result.exit_code != 0
+        assert "target" in result.output.lower() or "target" in (result.stderr or "").lower()
+
+    def test_dry_run_shows_hypothesis_no_tox(self, tmp_path):
+        program_file = tmp_path / "program.md"
+        program_file.write_text("Improve logging coverage in agent module")
+
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            result = runner.invoke(
+                app,
+                [
+                    "learn",
+                    "--target",
+                    "src/timmy/agent.py",
+                    "--program",
+                    str(program_file),
+                    "--max-experiments",
+                    "2",
+                    "--dry-run",
+                ],
+            )
+
+        assert result.exit_code == 0
+        # tox should never be called in dry-run
+        mock_run.assert_not_called()
+        assert "agent.py" in result.output
+        assert "Improve logging coverage" in result.output
+
+    def test_missing_program_md_warns_but_continues(self, tmp_path):
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(returncode=0, stdout="3 passed", stderr="")
+            result = runner.invoke(
+                app,
+                [
+                    "learn",
+                    "--target",
+                    "src/timmy/agent.py",
+                    "--program",
+                    str(tmp_path / "nonexistent.md"),
+                    "--max-experiments",
+                    "1",
+                    "--dry-run",
+                ],
+            )
+
+        assert result.exit_code == 0
+        assert "not found" in result.output
+        # dry-run must stay side-effect free even when the program file is absent
+        mock_run.assert_not_called()
+
+    def test_dry_run_prints_max_experiments_hypotheses(self, tmp_path):
+        program_file = tmp_path / "program.md"
+        program_file.write_text("Fix edge case in parser")
+
+        result = runner.invoke(
+            app,
+            [
+                "learn",
+                "--target",
+                "src/timmy/parser.py",
+                "--program",
+                str(program_file),
+                "--max-experiments",
+                "3",
+                "--dry-run",
+            ],
+        )
+
+        assert result.exit_code == 0
+        # Should show 3 experiment headers
+        for index in (1, 2, 3):
+            assert result.output.count(f"[{index}/3]") == 1
+
+    def test_help_text_present(self):
+        result = runner.invoke(app, ["learn", "--help"])
+        assert result.exit_code == 0
+        for flag in ("--target", "--metric", "--budget", "--max-experiments", "--dry-run"):
+            assert flag in result.output