From b2f970b77cb9a25d6e39cfee291492cd9fc05798 Mon Sep 17 00:00:00 2001
From: kimi <kimi@localhost>
Date: Mon, 23 Mar 2026 19:23:53 -0400
Subject: [PATCH] refactor(autoresearch): refactor autoresearch.py ->
 SystemExperiment class

- Add run() method to SystemExperiment class for full experiment loop
- Add create_branch() method for git branch management
- Add metric_fn parameter for custom metric extraction
- Add revert_on_failure and hypothesis attributes
- Update CLI learn command to use new SystemExperiment.run() method
- Add tests for new functionality (run, create_branch, custom metric_fn)
- Maintain backward compatibility with standalone functions

Refs #906
---
 src/timmy/autoresearch.py        | 196 +++++++++++++++++++++++++++++--
 src/timmy/cli.py                 | 137 ++++++++++++---------
 tests/timmy/test_autoresearch.py | 112 +++++++++++++++++-
 3 files changed, 380 insertions(+), 65 deletions(-)

diff --git a/src/timmy/autoresearch.py b/src/timmy/autoresearch.py
index ede95927..b7013063 100644
--- a/src/timmy/autoresearch.py
+++ b/src/timmy/autoresearch.py
@@ -8,7 +8,7 @@ Flow:
   1. prepare_experiment  — clone repo + run data prep
   2. run_experiment      — execute train.py with wall-clock timeout
   3. evaluate_result     — compare metric against baseline
-  4. experiment_loop     — orchestrate the full cycle
+  4. SystemExperiment    — orchestrate the full cycle via class interface
 
 All subprocess calls are guarded with timeouts for graceful degradation.
 """
@@ -20,6 +20,7 @@ import logging
 import re
 import subprocess
 import time
+from collections.abc import Callable
 from pathlib import Path
 from typing import Any
 
@@ -29,6 +30,10 @@ DEFAULT_REPO = "https://github.com/karpathy/autoresearch.git"
 _METRIC_RE = re.compile(r"val_bpb[:\s]+([0-9]+\.?[0-9]*)")
 
 
+# ── Higher-is-better metric names ────────────────────────────────────────────
+_HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"})
+
+
 def prepare_experiment(
     workspace: Path,
     repo_url: str = DEFAULT_REPO,
@@ -125,7 +130,9 @@ def run_experiment(
             "log": output[-2000:],  # Keep last 2k chars
             "duration_s": duration,
             "success": result.returncode == 0,
-            "error": None if result.returncode == 0 else f"Exit code {result.returncode}",
+            "error": (
+                None if result.returncode == 0 else f"Exit code {result.returncode}"
+            ),
         }
     except subprocess.TimeoutExpired:
         duration = int(time.monotonic() - start)
@@ -179,9 +186,13 @@ def evaluate_result(
     pct = (delta / baseline) * 100 if baseline != 0 else 0.0
 
     if delta < 0:
-        return f"Improvement: {metric_name} {baseline:.4f} -> {current:.4f} ({pct:+.2f}%)"
+        return (
+            f"Improvement: {metric_name} {baseline:.4f} -> {current:.4f} ({pct:+.2f}%)"
+        )
     elif delta > 0:
-        return f"Regression: {metric_name} {baseline:.4f} -> {current:.4f} ({pct:+.2f}%)"
+        return (
+            f"Regression: {metric_name} {baseline:.4f} -> {current:.4f} ({pct:+.2f}%)"
+        )
     else:
         return f"No change: {metric_name} = {current:.4f}"
 
@@ -237,10 +248,6 @@ def _extract_coverage(output: str) -> float | None:
     return None
 
 
-# ── Higher-is-better metric names ────────────────────────────────────────────
-_HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"})
-
-
 class SystemExperiment:
     """An autoresearch experiment targeting a specific module with a configurable metric.
 
@@ -254,6 +261,10 @@ class SystemExperiment:
             Any other value is forwarded to :func:`_extract_metric`.
         budget_minutes: Wall-clock budget per experiment (default 5 min).
         workspace: Working directory for subprocess calls.  Defaults to ``cwd``.
+        revert_on_failure: Whether to revert changes on failed experiments.
+        hypothesis: Optional natural language hypothesis for the experiment.
+        metric_fn: Optional callable for custom metric extraction.
+            If provided, overrides built-in metric extraction.
     """
 
     def __init__(
@@ -262,11 +273,19 @@ class SystemExperiment:
         metric: str = "unit_pass_rate",
         budget_minutes: int = 5,
         workspace: Path | None = None,
+        revert_on_failure: bool = True,
+        hypothesis: str = "",
+        metric_fn: Callable[[str], float | None] | None = None,
     ) -> None:
         self.target = target
         self.metric = metric
         self.budget_seconds = budget_minutes * 60
         self.workspace = Path(workspace) if workspace else Path.cwd()
+        self.revert_on_failure = revert_on_failure
+        self.hypothesis = hypothesis
+        self.metric_fn = metric_fn
+        self.results: list[dict[str, Any]] = []
+        self.baseline: float | None = None
 
     # ── Hypothesis generation ─────────────────────────────────────────────────
 
@@ -341,7 +360,9 @@ class SystemExperiment:
                 "log": output[-3000:],
                 "duration_s": duration,
                 "success": result.returncode == 0,
-                "error": None if result.returncode == 0 else f"Exit code {result.returncode}",
+                "error": (
+                    None if result.returncode == 0 else f"Exit code {result.returncode}"
+                ),
             }
         except subprocess.TimeoutExpired:
             duration = int(time.monotonic() - start)
@@ -353,10 +374,24 @@ class SystemExperiment:
                 "error": f"Budget exceeded after {self.budget_seconds}s",
             }
         except OSError as exc:
-            return {"metric": None, "log": "", "duration_s": 0, "success": False, "error": str(exc)}
+            return {
+                "metric": None,
+                "log": "",
+                "duration_s": 0,
+                "success": False,
+                "error": str(exc),
+            }
 
     def _extract_tox_metric(self, output: str) -> float | None:
         """Dispatch to the correct metric extractor based on *self.metric*."""
+        # Use custom metric function if provided
+        if self.metric_fn is not None:
+            try:
+                return self.metric_fn(output)
+            except Exception as exc:
+                logger.warning("Custom metric_fn failed: %s", exc)
+                return None
+
         if self.metric == "unit_pass_rate":
             return _extract_pass_rate(output)
         if self.metric == "coverage":
@@ -391,6 +426,20 @@ class SystemExperiment:
 
     # ── Git phase ─────────────────────────────────────────────────────────────
 
+    def create_branch(self, branch_name: str) -> bool:
+        """Create and checkout a new git branch. Returns True on success."""
+        try:
+            subprocess.run(
+                ["git", "checkout", "-b", branch_name],
+                cwd=str(self.workspace),
+                check=True,
+                timeout=30,
+            )
+            return True
+        except subprocess.CalledProcessError as exc:
+            logger.warning("Git branch creation failed: %s", exc)
+            return False
+
     def commit_changes(self, message: str) -> bool:
         """Stage and commit all changes.  Returns True on success."""
         try:
@@ -421,3 +470,130 @@ class SystemExperiment:
         except subprocess.CalledProcessError as exc:
             logger.warning("Git revert failed: %s", exc)
             return False
+
+    # ── Full experiment loop ──────────────────────────────────────────────────
+
+    def run(
+        self,
+        tox_env: str = "unit",
+        model: str = "qwen3:30b",
+        program_content: str = "",
+        max_iterations: int = 1,
+        dry_run: bool = False,
+        create_branch: bool = False,
+    ) -> dict[str, Any]:
+        """Run the full experiment loop: hypothesis → edit → tox → evaluate → commit/revert.
+
+        This method encapsulates the complete experiment cycle, running multiple
+        iterations until an improvement is found or max_iterations is reached.
+
+        Args:
+            tox_env: Tox environment to run (default "unit").
+            model: Ollama model for Aider edits (default "qwen3:30b").
+            program_content: Research direction for hypothesis generation.
+            max_iterations: Maximum number of experiment iterations.
+            dry_run: If True, only generate hypotheses without making changes.
+            create_branch: If True, create a new git branch for the experiment.
+
+        Returns:
+            Dict with keys: ``success`` (bool), ``final_metric`` (float|None),
+            ``baseline`` (float|None), ``iterations`` (int), ``results`` (list).
+        """
+        if create_branch:
+            branch_name = (
+                f"autoresearch/{self.target.replace('/', '-')}-{int(time.time())}"
+            )
+            self.create_branch(branch_name)
+
+        baseline: float | None = self.baseline
+        final_metric: float | None = None
+        success = False
+
+        for iteration in range(1, max_iterations + 1):
+            logger.info("Experiment iteration %d/%d", iteration, max_iterations)
+
+            # Generate hypothesis
+            hypothesis = self.hypothesis or self.generate_hypothesis(program_content)
+            logger.info("Hypothesis: %s", hypothesis)
+
+            # In dry-run mode, just record the hypothesis and continue
+            if dry_run:
+                result_record = {
+                    "iteration": iteration,
+                    "hypothesis": hypothesis,
+                    "metric": None,
+                    "baseline": baseline,
+                    "assessment": "Dry-run: no changes made",
+                    "success": True,
+                    "duration_s": 0,
+                }
+                self.results.append(result_record)
+                continue
+
+            # Apply edit
+            edit_result = self.apply_edit(hypothesis, model=model)
+            edit_failed = "not available" in edit_result or edit_result.startswith(
+                "Aider error"
+            )
+            if edit_failed:
+                logger.warning("Edit phase failed: %s", edit_result)
+
+            # Run evaluation
+            tox_result = self.run_tox(tox_env=tox_env)
+            metric = tox_result["metric"]
+
+            # Evaluate result
+            assessment = self.evaluate(metric, baseline)
+            logger.info("Assessment: %s", assessment)
+
+            # Store result
+            result_record = {
+                "iteration": iteration,
+                "hypothesis": hypothesis,
+                "metric": metric,
+                "baseline": baseline,
+                "assessment": assessment,
+                "success": tox_result["success"],
+                "duration_s": tox_result["duration_s"],
+            }
+            self.results.append(result_record)
+
+            # Set baseline on first successful run
+            if metric is not None and baseline is None:
+                baseline = metric
+                self.baseline = baseline
+                final_metric = metric
+                continue
+
+            # Determine if we should commit or revert
+            should_commit = False
+            if tox_result["success"] and metric is not None and baseline is not None:
+                if self.is_improvement(metric, baseline):
+                    should_commit = True
+                    final_metric = metric
+                    baseline = metric
+                    self.baseline = baseline
+                    success = True
+
+            if should_commit:
+                commit_msg = f"autoresearch: improve {self.metric} on {self.target}\n\n{hypothesis}"
+                if self.commit_changes(commit_msg):
+                    logger.info("Changes committed")
+                else:
+                    self.revert_changes()
+                    logger.warning("Commit failed, changes reverted")
+            elif self.revert_on_failure:
+                self.revert_changes()
+                logger.info("Changes reverted (no improvement)")
+
+            # Early exit if we found an improvement
+            if success:
+                break
+
+        return {
+            "success": success,
+            "final_metric": final_metric,
+            "baseline": self.baseline,
+            "iterations": len(self.results),
+            "results": self.results,
+        }
diff --git a/src/timmy/cli.py b/src/timmy/cli.py
index f82e99ef..44eb9b79 100644
--- a/src/timmy/cli.py
+++ b/src/timmy/cli.py
@@ -96,10 +96,14 @@ def _decide_autonomous(req, tool_name: str, tool_args: dict) -> None:
         logger.info("AUTO-APPROVED (allowlist): %s", tool_name)
     else:
         req.reject(note="Auto-rejected: not in allowlist")
-        logger.info("AUTO-REJECTED (not allowlisted): %s %s", tool_name, str(tool_args)[:100])
+        logger.info(
+            "AUTO-REJECTED (not allowlisted): %s %s", tool_name, str(tool_args)[:100]
+        )
 
 
-def _handle_tool_confirmation(agent, run_output, session_id: str, *, autonomous: bool = False):
+def _handle_tool_confirmation(
+    agent, run_output, session_id: str, *, autonomous: bool = False
+):
     """Prompt user to approve/reject dangerous tool calls.
 
     When Agno pauses a run because a tool requires confirmation, this
@@ -173,7 +177,9 @@ def think(
 ):
     """Ask Timmy to think carefully about a topic."""
     timmy = create_timmy(backend=backend, session_id=_CLI_SESSION_ID)
-    timmy.print_response(f"Think carefully about: {topic}", stream=True, session_id=_CLI_SESSION_ID)
+    timmy.print_response(
+        f"Think carefully about: {topic}", stream=True, session_id=_CLI_SESSION_ID
+    )
 
 
 def _read_message_input(message: list[str]) -> str:
@@ -246,7 +252,9 @@ def chat(
     timmy = create_timmy(backend=backend, session_id=session_id)
 
     run_output = timmy.run(message_str, stream=False, session_id=session_id)
-    run_output = _handle_tool_confirmation(timmy, run_output, session_id, autonomous=autonomous)
+    run_output = _handle_tool_confirmation(
+        timmy, run_output, session_id, autonomous=autonomous
+    )
 
     content = run_output.content if hasattr(run_output, "content") else str(run_output)
     if content:
@@ -300,7 +308,9 @@ def repl(
                 break
 
             try:
-                response = loop.run_until_complete(chat(user_input, session_id=session_id))
+                response = loop.run_until_complete(
+                    chat(user_input, session_id=session_id)
+                )
                 if response:
                     typer.echo(response)
                     typer.echo()
@@ -347,7 +357,10 @@ def interview(
         # Force agent creation by calling chat once with a warm-up prompt
         try:
             loop.run_until_complete(
-                chat("Hello, Timmy. We're about to start your interview.", session_id="interview")
+                chat(
+                    "Hello, Timmy. We're about to start your interview.",
+                    session_id="interview",
+                )
             )
         except Exception as exc:
             typer.echo(f"Warning: Initialization issue — {exc}", err=True)
@@ -360,7 +373,9 @@ def interview(
 
         typer.echo("Starting interview...\n")
         transcript = run_interview(
-            chat_fn=lambda msg: loop.run_until_complete(chat(msg, session_id="interview")),
+            chat_fn=lambda msg: loop.run_until_complete(
+                chat(msg, session_id="interview")
+            ),
             on_answer=_on_answer,
         )
 
@@ -381,7 +396,9 @@ def interview(
 @app.command()
 def up(
     dev: bool = typer.Option(False, "--dev", help="Enable hot-reload for development"),
-    build: bool = typer.Option(True, "--build/--no-build", help="Rebuild images before starting"),
+    build: bool = typer.Option(
+        True, "--build/--no-build", help="Rebuild images before starting"
+    ),
 ):
     """Start Timmy Time in Docker (dashboard + agents)."""
     cmd = ["docker", "compose"]
@@ -410,13 +427,23 @@ def down():
 @app.command()
 def voice(
     whisper_model: str = typer.Option(
-        "base.en", "--whisper", "-w", help="Whisper model: tiny.en, base.en, small.en, medium.en"
+        "base.en",
+        "--whisper",
+        "-w",
+        help="Whisper model: tiny.en, base.en, small.en, medium.en",
+    ),
+    use_say: bool = typer.Option(
+        False, "--say", help="Use macOS `say` instead of Piper TTS"
     ),
-    use_say: bool = typer.Option(False, "--say", help="Use macOS `say` instead of Piper TTS"),
     threshold: float = typer.Option(
-        0.015, "--threshold", "-t", help="Mic silence threshold (RMS). Lower = more sensitive."
+        0.015,
+        "--threshold",
+        "-t",
+        help="Mic silence threshold (RMS). Lower = more sensitive.",
+    ),
+    silence: float = typer.Option(
+        1.5, "--silence", help="Seconds of silence to end recording"
     ),
-    silence: float = typer.Option(1.5, "--silence", help="Seconds of silence to end recording"),
     backend: str | None = _BACKEND_OPTION,
     model_size: str | None = _MODEL_SIZE_OPTION,
 ):
@@ -457,9 +484,12 @@ def route(
 @app.command()
 def focus(
     topic: str | None = typer.Argument(
-        None, help='Topic to focus on (e.g. "three-phase loop"). Omit to show current focus.'
+        None,
+        help='Topic to focus on (e.g. "three-phase loop"). Omit to show current focus.',
+    ),
+    clear: bool = typer.Option(
+        False, "--clear", "-c", help="Clear focus and return to broad mode"
     ),
-    clear: bool = typer.Option(False, "--clear", "-c", help="Clear focus and return to broad mode"),
 ):
     """Set deep-focus mode on a single problem.
 
@@ -495,7 +525,9 @@ def healthcheck(
     verbose: bool = typer.Option(
         False, "--verbose", "-v", help="Show verbose output including issue details"
     ),
-    quiet: bool = typer.Option(False, "--quiet", "-q", help="Only show status line (no details)"),
+    quiet: bool = typer.Option(
+        False, "--quiet", "-q", help="Only show status line (no details)"
+    ),
 ):
     """Quick health snapshot before coding.
 
@@ -604,7 +636,8 @@ def learn(
 
     if target is None:
         typer.echo(
-            "Error: --target is required. Specify the module or file to optimise.", err=True
+            "Error: --target is required. Specify the module or file to optimise.",
+            err=True,
         )
         raise typer.Exit(1)
 
@@ -616,59 +649,57 @@ def learn(
 
     typer.echo()
     typer.echo(typer.style("Autoresearch", bold=True) + f" — {target}")
-    typer.echo(f"  metric={metric}  budget={budget}min  max={max_experiments}  tox={tox_env}")
+    typer.echo(
+        f"  metric={metric}  budget={budget}min  max={max_experiments}  tox={tox_env}"
+    )
     if dry_run:
         typer.echo("  (dry-run — no changes will be made)")
     typer.echo()
 
-    baseline: float | None = None
+    def _progress_callback(iteration: int, max_iter: int, message: str) -> None:
+        """Print progress updates during experiment iterations."""
+        if iteration > 0:
+            prefix = typer.style(f"[{iteration}/{max_iter}]", bold=True)
+            typer.echo(f"{prefix} {message}")
 
     try:
-        for i in range(1, max_experiments + 1):
-            typer.echo(typer.style(f"[{i}/{max_experiments}]", bold=True), nl=False)
+        # Run the full experiment loop via the SystemExperiment class
+        result = experiment.run(
+            tox_env=tox_env,
+            model=model,
+            program_content=program_content,
+            max_iterations=max_experiments,
+            dry_run=dry_run,
+            create_branch=False,  # CLI mode: work on current branch
+        )
 
-            hypothesis = experiment.generate_hypothesis(program_content)
-            typer.echo(f" {hypothesis}")
+        # Display results for each iteration
+        for i, record in enumerate(experiment.results, 1):
+            _progress_callback(i, max_experiments, record["hypothesis"])
 
             if dry_run:
                 continue
 
-            # Edit phase
+            # Edit phase result
             typer.echo("  → editing …", nl=False)
-            edit_result = experiment.apply_edit(hypothesis, model=model)
-            if "not available" in edit_result or edit_result.startswith("Aider error"):
-                typer.echo(f" skipped ({edit_result.split(':')[0]})")
+            if record.get("edit_failed"):
+                typer.echo(f" skipped ({record.get('edit_result', 'unknown')})")
             else:
                 typer.echo(" done")
 
-            # Evaluate phase
-            typer.echo("  → running tox …", nl=False)
-            tox_result = experiment.run_tox(tox_env=tox_env)
-            typer.echo(f" {tox_result['duration_s']}s")
+            # Evaluate phase result
+            duration = record.get("duration_s", 0)
+            typer.echo(f"  → running tox … {duration}s")
 
-            assessment = experiment.evaluate(tox_result["metric"], baseline)
+            # Assessment
+            assessment = record.get("assessment", "No assessment")
             typer.echo(f"  → {assessment}")
 
-            if tox_result["metric"] is not None and baseline is None:
-                baseline = tox_result["metric"]
-
-            if tox_result["success"] and tox_result["metric"] is not None and baseline is not None:
-                if experiment.is_improvement(tox_result["metric"], baseline):
-                    commit_msg = (
-                        f"autoresearch: improve {metric} on {target} — {assessment}"
-                    )
-                    if experiment.commit_changes(commit_msg):
-                        typer.echo("  → committed")
-                        baseline = tox_result["metric"]
-                    else:
-                        experiment.revert_changes()
-                        typer.echo("  → commit failed, reverted")
-                else:
-                    experiment.revert_changes()
-                    typer.echo("  → reverted (no improvement)")
-            elif not tox_result["success"]:
-                experiment.revert_changes()
-                typer.echo(f"  → reverted ({tox_result['error']})")
+            # Outcome
+            if record.get("committed"):
+                typer.echo("  → committed")
+            elif record.get("reverted"):
+                typer.echo("  → reverted (no improvement)")
 
             typer.echo()
 
@@ -677,8 +708,8 @@ def learn(
         raise typer.Exit(0) from None
 
     typer.echo(typer.style("Autoresearch complete.", bold=True))
-    if baseline is not None:
-        typer.echo(f"Final {metric}: {baseline:.4f}")
+    if result.get("baseline") is not None:
+        typer.echo(f"Final {metric}: {result['baseline']:.4f}")
 
 
 def main():
diff --git a/tests/timmy/test_autoresearch.py b/tests/timmy/test_autoresearch.py
index e5143fda..a4d260fe 100644
--- a/tests/timmy/test_autoresearch.py
+++ b/tests/timmy/test_autoresearch.py
@@ -39,7 +39,9 @@ class TestPrepareExperiment:
         from timmy.autoresearch import prepare_experiment
 
         with patch("timmy.autoresearch.subprocess.run") as mock_run:
-            mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="auth failed")
+            mock_run.return_value = MagicMock(
+                returncode=1, stdout="", stderr="auth failed"
+            )
             result = prepare_experiment(tmp_path)
 
         assert "failed" in result.lower()
@@ -102,7 +104,9 @@ class TestRunExperiment:
         (repo_dir / "train.py").write_text("print('done')")
 
         with patch("timmy.autoresearch.subprocess.run") as mock_run:
-            mock_run.return_value = MagicMock(returncode=0, stdout="no metrics here", stderr="")
+            mock_run.return_value = MagicMock(
+                returncode=0, stdout="no metrics here", stderr=""
+            )
             result = run_experiment(tmp_path)
 
         assert result["success"] is True
@@ -349,3 +353,107 @@ class TestSystemExperiment:
             success = exp.revert_changes()
 
         assert success is False
+
+    def test_create_branch_success(self, tmp_path):
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", workspace=tmp_path)
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(returncode=0)
+            success = exp.create_branch("feature/test-branch")
+
+        assert success is True
+        # Verify correct git command was called
+        mock_run.assert_called_once()
+        call_args = mock_run.call_args[0][0]
+        assert "checkout" in call_args
+        assert "-b" in call_args
+        assert "feature/test-branch" in call_args
+
+    def test_create_branch_failure(self, tmp_path):
+        import subprocess
+
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", workspace=tmp_path)
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.side_effect = subprocess.CalledProcessError(1, "git")
+            success = exp.create_branch("feature/test-branch")
+
+        assert success is False
+
+    def test_run_dry_run_mode(self, tmp_path):
+        """Test that run() in dry_run mode only generates hypotheses."""
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", workspace=tmp_path)
+        result = exp.run(max_iterations=3, dry_run=True, program_content="Test program")
+
+        assert result["iterations"] == 3
+        assert result["success"] is False  # No actual experiments run
+        assert len(exp.results) == 3
+        # Each result should have a hypothesis
+        for record in exp.results:
+            assert "hypothesis" in record
+
+    def test_run_with_custom_metric_fn(self, tmp_path):
+        """Test that custom metric_fn is used for metric extraction."""
+        from timmy.autoresearch import SystemExperiment
+
+        def custom_metric_fn(output: str) -> float | None:
+            match = __import__("re").search(r"custom_metric:\s*([0-9.]+)", output)
+            return float(match.group(1)) if match else None
+
+        exp = SystemExperiment(
+            target="x.py",
+            workspace=tmp_path,
+            metric="custom",
+            metric_fn=custom_metric_fn,
+        )
+
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(
+                returncode=0,
+                stdout="custom_metric: 42.5\nother output",
+                stderr="",
+            )
+            tox_result = exp.run_tox()
+
+        assert tox_result["metric"] == pytest.approx(42.5)
+
+    def test_run_single_iteration_success(self, tmp_path):
+        """Test a successful single iteration that finds an improvement."""
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", workspace=tmp_path)
+
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            # Mock tox returning a passing test with metric
+            mock_run.return_value = MagicMock(
+                returncode=0,
+                stdout="10 passed in 1.23s",
+                stderr="",
+            )
+            result = exp.run(max_iterations=1, tox_env="unit")
+
+        assert result["iterations"] == 1
+        assert len(exp.results) == 1
+        assert exp.results[0]["metric"] == pytest.approx(100.0)
+
+    def test_run_stores_baseline_on_first_success(self, tmp_path):
+        """Test that baseline is set after first successful iteration."""
+        from timmy.autoresearch import SystemExperiment
+
+        exp = SystemExperiment(target="x.py", workspace=tmp_path)
+        assert exp.baseline is None
+
+        with patch("timmy.autoresearch.subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(
+                returncode=0,
+                stdout="8 passed in 1.23s",
+                stderr="",
+            )
+            exp.run(max_iterations=1)
+
+        assert exp.baseline == pytest.approx(100.0)
+        assert exp.results[0]["baseline"] is None  # First run has no baseline
-- 
2.43.0