[claude] Add timmy learn autoresearch entry point (#907) (#1240)
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
Tests / lint (pull_request) Failing after 16s
Tests / test (pull_request) Has been skipped

Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
This commit was merged in pull request #1240.
This commit is contained in:
2026-03-23 23:14:09 +00:00
committed by rockachopa
parent ea76af068a
commit 6691f4d1f3
5 changed files with 653 additions and 0 deletions

23
program.md Normal file
View File

@@ -0,0 +1,23 @@
# Research Direction
This file guides the `timmy learn` autoresearch loop. Edit it to focus
autonomous experiments on a specific goal.
## Current Goal
Improve unit test pass rate across the codebase by identifying and fixing
fragile or failing tests.
## Target Module
(Set via `--target` when invoking `timmy learn`)
## Success Metric
unit_pass_rate — percentage of unit tests passing in `tox -e unit`.
## Notes
- Experiments run one at a time; each is time-boxed by `--budget`.
- Improvements are committed automatically; regressions are reverted.
- Use `--dry-run` to preview hypotheses without making changes.

View File

@@ -212,3 +212,212 @@ def _append_result(workspace: Path, result: dict[str, Any]) -> None:
results_file.parent.mkdir(parents=True, exist_ok=True) results_file.parent.mkdir(parents=True, exist_ok=True)
with results_file.open("a") as f: with results_file.open("a") as f:
f.write(json.dumps(result) + "\n") f.write(json.dumps(result) + "\n")
def _extract_pass_rate(output: str) -> float | None:
"""Extract pytest pass rate as a percentage from tox/pytest output."""
passed_m = re.search(r"(\d+) passed", output)
failed_m = re.search(r"(\d+) failed", output)
if passed_m:
passed = int(passed_m.group(1))
failed = int(failed_m.group(1)) if failed_m else 0
total = passed + failed
return (passed / total * 100.0) if total > 0 else 100.0
return None
def _extract_coverage(output: str) -> float | None:
"""Extract total coverage percentage from coverage output."""
coverage_m = re.search(r"(?:TOTAL\s+\d+\s+\d+\s+|Total coverage:\s*)(\d+)%", output)
if coverage_m:
try:
return float(coverage_m.group(1))
except ValueError:
pass
return None
# ── Higher-is-better metric names ────────────────────────────────────────────
_HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"})
class SystemExperiment:
"""An autoresearch experiment targeting a specific module with a configurable metric.
Encapsulates the hypothesis → edit → tox → evaluate → commit/revert loop
for a single target file or module.
Args:
target: Path or module name to optimise (e.g. ``src/timmy/agent.py``).
metric: Metric to extract from tox output. Built-in values:
``unit_pass_rate`` (default), ``coverage``, ``val_bpb``.
Any other value is forwarded to :func:`_extract_metric`.
budget_minutes: Wall-clock budget per experiment (default 5 min).
workspace: Working directory for subprocess calls. Defaults to ``cwd``.
"""
def __init__(
self,
target: str,
metric: str = "unit_pass_rate",
budget_minutes: int = 5,
workspace: Path | None = None,
) -> None:
self.target = target
self.metric = metric
self.budget_seconds = budget_minutes * 60
self.workspace = Path(workspace) if workspace else Path.cwd()
# ── Hypothesis generation ─────────────────────────────────────────────────
def generate_hypothesis(self, program_content: str = "") -> str:
"""Return a plain-English hypothesis for the next experiment.
Uses the first non-empty line of *program_content* when available;
falls back to a generic description based on target and metric.
"""
first_line = ""
for line in program_content.splitlines():
stripped = line.strip()
if stripped and not stripped.startswith("#"):
first_line = stripped[:120]
break
if first_line:
return f"[{self.target}] {first_line}"
return f"Improve {self.metric} for {self.target}"
# ── Edit phase ────────────────────────────────────────────────────────────
def apply_edit(self, hypothesis: str, model: str = "qwen3:30b") -> str:
"""Apply code edits to *target* via Aider.
Returns a status string. Degrades gracefully — never raises.
"""
prompt = f"Edit {self.target}: {hypothesis}"
try:
result = subprocess.run(
["aider", "--no-git", "--model", f"ollama/{model}", "--quiet", prompt],
capture_output=True,
text=True,
timeout=self.budget_seconds,
cwd=str(self.workspace),
)
if result.returncode == 0:
return result.stdout or "Edit applied."
return f"Aider error (exit {result.returncode}): {result.stderr[:500]}"
except FileNotFoundError:
logger.warning("Aider not installed — edit skipped")
return "Aider not available — edit skipped"
except subprocess.TimeoutExpired:
logger.warning("Aider timed out after %ds", self.budget_seconds)
return "Aider timed out"
except (OSError, subprocess.SubprocessError) as exc:
logger.warning("Aider failed: %s", exc)
return f"Edit failed: {exc}"
# ── Evaluation phase ──────────────────────────────────────────────────────
def run_tox(self, tox_env: str = "unit") -> dict[str, Any]:
"""Run *tox_env* and return a result dict.
Returns:
Dict with keys: ``metric`` (float|None), ``log`` (str),
``duration_s`` (int), ``success`` (bool), ``error`` (str|None).
"""
start = time.monotonic()
try:
result = subprocess.run(
["tox", "-e", tox_env],
capture_output=True,
text=True,
timeout=self.budget_seconds,
cwd=str(self.workspace),
)
duration = int(time.monotonic() - start)
output = result.stdout + result.stderr
metric_val = self._extract_tox_metric(output)
return {
"metric": metric_val,
"log": output[-3000:],
"duration_s": duration,
"success": result.returncode == 0,
"error": None if result.returncode == 0 else f"Exit code {result.returncode}",
}
except subprocess.TimeoutExpired:
duration = int(time.monotonic() - start)
return {
"metric": None,
"log": f"Budget exceeded after {self.budget_seconds}s",
"duration_s": duration,
"success": False,
"error": f"Budget exceeded after {self.budget_seconds}s",
}
except OSError as exc:
return {"metric": None, "log": "", "duration_s": 0, "success": False, "error": str(exc)}
def _extract_tox_metric(self, output: str) -> float | None:
"""Dispatch to the correct metric extractor based on *self.metric*."""
if self.metric == "unit_pass_rate":
return _extract_pass_rate(output)
if self.metric == "coverage":
return _extract_coverage(output)
return _extract_metric(output, self.metric)
def evaluate(self, current: float | None, baseline: float | None) -> str:
"""Compare *current* metric against *baseline* and return an assessment."""
if current is None:
return "Indeterminate: metric not extracted from output"
if baseline is None:
unit = "%" if self.metric in _HIGHER_IS_BETTER else ""
return f"Baseline: {self.metric} = {current:.2f}{unit}"
if self.metric in _HIGHER_IS_BETTER:
delta = current - baseline
pct = (delta / baseline * 100) if baseline != 0 else 0.0
if delta > 0:
return f"Improvement: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
if delta < 0:
return f"Regression: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
return f"No change: {self.metric} = {current:.2f}%"
# lower-is-better (val_bpb, loss, etc.)
return evaluate_result(current, baseline, self.metric)
def is_improvement(self, current: float, baseline: float) -> bool:
"""Return True if *current* is better than *baseline* for this metric."""
if self.metric in _HIGHER_IS_BETTER:
return current > baseline
return current < baseline # lower-is-better
# ── Git phase ─────────────────────────────────────────────────────────────
def commit_changes(self, message: str) -> bool:
"""Stage and commit all changes. Returns True on success."""
try:
subprocess.run(
["git", "add", "-A"], cwd=str(self.workspace), check=True, timeout=30
)
subprocess.run(
["git", "commit", "-m", message],
cwd=str(self.workspace),
check=True,
timeout=30,
)
return True
except subprocess.CalledProcessError as exc:
logger.warning("Git commit failed: %s", exc)
return False
def revert_changes(self) -> bool:
"""Revert all uncommitted changes. Returns True on success."""
try:
subprocess.run(
["git", "checkout", "--", "."],
cwd=str(self.workspace),
check=True,
timeout=30,
)
return True
except subprocess.CalledProcessError as exc:
logger.warning("Git revert failed: %s", exc)
return False

View File

@@ -527,5 +527,159 @@ def healthcheck(
raise typer.Exit(result.returncode) raise typer.Exit(result.returncode)
@app.command()
def learn(
target: str | None = typer.Option(
None,
"--target",
"-t",
help="Module or file to optimise (e.g. 'src/timmy/agent.py')",
),
metric: str = typer.Option(
"unit_pass_rate",
"--metric",
"-m",
help="Metric to track: unit_pass_rate | coverage | val_bpb | <custom>",
),
budget: int = typer.Option(
5,
"--budget",
help="Time limit per experiment in minutes",
),
max_experiments: int = typer.Option(
10,
"--max-experiments",
help="Cap on total experiments per run",
),
dry_run: bool = typer.Option(
False,
"--dry-run",
help="Show hypothesis without executing experiments",
),
program_file: str | None = typer.Option(
None,
"--program",
"-p",
help="Path to research direction file (default: program.md in cwd)",
),
tox_env: str = typer.Option(
"unit",
"--tox-env",
help="Tox environment to run for each evaluation",
),
model: str = typer.Option(
"qwen3:30b",
"--model",
help="Ollama model forwarded to Aider for code edits",
),
):
"""Start an autonomous improvement loop (autoresearch).
Reads program.md for research direction, then iterates:
hypothesis → edit → tox → evaluate → commit/revert.
Experiments continue until --max-experiments is reached or the loop is
interrupted with Ctrl+C. Use --dry-run to preview hypotheses without
making any changes.
Example:
timmy learn --target src/timmy/agent.py --metric unit_pass_rate
"""
from pathlib import Path
from timmy.autoresearch import SystemExperiment
repo_root = Path.cwd()
program_path = Path(program_file) if program_file else repo_root / "program.md"
if program_path.exists():
program_content = program_path.read_text()
typer.echo(f"Research direction: {program_path}")
else:
program_content = ""
typer.echo(
f"Note: {program_path} not found — proceeding without research direction.",
err=True,
)
if target is None:
typer.echo(
"Error: --target is required. Specify the module or file to optimise.", err=True
)
raise typer.Exit(1)
experiment = SystemExperiment(
target=target,
metric=metric,
budget_minutes=budget,
)
typer.echo()
typer.echo(typer.style("Autoresearch", bold=True) + f"{target}")
typer.echo(f" metric={metric} budget={budget}min max={max_experiments} tox={tox_env}")
if dry_run:
typer.echo(" (dry-run — no changes will be made)")
typer.echo()
baseline: float | None = None
try:
for i in range(1, max_experiments + 1):
typer.echo(typer.style(f"[{i}/{max_experiments}]", bold=True), nl=False)
hypothesis = experiment.generate_hypothesis(program_content)
typer.echo(f" {hypothesis}")
if dry_run:
continue
# Edit phase
typer.echo(" → editing …", nl=False)
edit_result = experiment.apply_edit(hypothesis, model=model)
if "not available" in edit_result or edit_result.startswith("Aider error"):
typer.echo(f" skipped ({edit_result.split(':')[0]})")
else:
typer.echo(" done")
# Evaluate phase
typer.echo(" → running tox …", nl=False)
tox_result = experiment.run_tox(tox_env=tox_env)
typer.echo(f" {tox_result['duration_s']}s")
assessment = experiment.evaluate(tox_result["metric"], baseline)
typer.echo(f"{assessment}")
if tox_result["metric"] is not None and baseline is None:
baseline = tox_result["metric"]
if tox_result["success"] and tox_result["metric"] is not None and baseline is not None:
if experiment.is_improvement(tox_result["metric"], baseline):
commit_msg = (
f"autoresearch: improve {metric} on {target}{assessment}"
)
if experiment.commit_changes(commit_msg):
typer.echo(" → committed")
baseline = tox_result["metric"]
else:
experiment.revert_changes()
typer.echo(" → commit failed, reverted")
else:
experiment.revert_changes()
typer.echo(" → reverted (no improvement)")
elif not tox_result["success"]:
experiment.revert_changes()
typer.echo(f" → reverted ({tox_result['error']})")
typer.echo()
except KeyboardInterrupt:
typer.echo("\nInterrupted.")
raise typer.Exit(0) from None
typer.echo(typer.style("Autoresearch complete.", bold=True))
if baseline is not None:
typer.echo(f"Final {metric}: {baseline:.4f}")
def main(): def main():
app() app()

View File

@@ -176,3 +176,176 @@ class TestExtractMetric:
output = "loss: 0.45\nloss: 0.32" output = "loss: 0.45\nloss: 0.32"
assert _extract_metric(output, "loss") == pytest.approx(0.32) assert _extract_metric(output, "loss") == pytest.approx(0.32)
class TestExtractPassRate:
"""Tests for _extract_pass_rate()."""
def test_all_passing(self):
from timmy.autoresearch import _extract_pass_rate
output = "5 passed in 1.23s"
assert _extract_pass_rate(output) == pytest.approx(100.0)
def test_mixed_results(self):
from timmy.autoresearch import _extract_pass_rate
output = "8 passed, 2 failed in 2.00s"
assert _extract_pass_rate(output) == pytest.approx(80.0)
def test_no_pytest_output(self):
from timmy.autoresearch import _extract_pass_rate
assert _extract_pass_rate("no test results here") is None
class TestExtractCoverage:
"""Tests for _extract_coverage()."""
def test_total_line(self):
from timmy.autoresearch import _extract_coverage
output = "TOTAL 1234 100 92%"
assert _extract_coverage(output) == pytest.approx(92.0)
def test_no_coverage(self):
from timmy.autoresearch import _extract_coverage
assert _extract_coverage("no coverage data") is None
class TestSystemExperiment:
"""Tests for SystemExperiment class."""
def test_generate_hypothesis_with_program(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="src/timmy/agent.py")
hyp = exp.generate_hypothesis("Fix memory leak in session handling")
assert "src/timmy/agent.py" in hyp
assert "Fix memory leak" in hyp
def test_generate_hypothesis_fallback(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="src/timmy/agent.py", metric="coverage")
hyp = exp.generate_hypothesis("")
assert "src/timmy/agent.py" in hyp
assert "coverage" in hyp
def test_generate_hypothesis_skips_comment_lines(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="mymodule.py")
hyp = exp.generate_hypothesis("# comment\nActual direction here")
assert "Actual direction" in hyp
def test_evaluate_baseline(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
result = exp.evaluate(85.0, None)
assert "Baseline" in result
assert "85" in result
def test_evaluate_improvement_higher_is_better(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
result = exp.evaluate(90.0, 85.0)
assert "Improvement" in result
def test_evaluate_regression_higher_is_better(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", metric="coverage")
result = exp.evaluate(80.0, 85.0)
assert "Regression" in result
def test_evaluate_none_metric(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py")
result = exp.evaluate(None, 80.0)
assert "Indeterminate" in result
def test_evaluate_lower_is_better(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", metric="val_bpb")
result = exp.evaluate(1.1, 1.2)
assert "Improvement" in result
def test_is_improvement_higher_is_better(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", metric="unit_pass_rate")
assert exp.is_improvement(90.0, 85.0) is True
assert exp.is_improvement(80.0, 85.0) is False
def test_is_improvement_lower_is_better(self):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", metric="val_bpb")
assert exp.is_improvement(1.1, 1.2) is True
assert exp.is_improvement(1.3, 1.2) is False
def test_run_tox_success(self, tmp_path):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", workspace=tmp_path)
with patch("timmy.autoresearch.subprocess.run") as mock_run:
mock_run.return_value = MagicMock(
returncode=0,
stdout="8 passed in 1.23s",
stderr="",
)
result = exp.run_tox(tox_env="unit")
assert result["success"] is True
assert result["metric"] == pytest.approx(100.0)
def test_run_tox_timeout(self, tmp_path):
import subprocess
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", budget_minutes=1, workspace=tmp_path)
with patch("timmy.autoresearch.subprocess.run") as mock_run:
mock_run.side_effect = subprocess.TimeoutExpired(cmd="tox", timeout=60)
result = exp.run_tox()
assert result["success"] is False
assert "Budget exceeded" in result["error"]
def test_apply_edit_aider_not_installed(self, tmp_path):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", workspace=tmp_path)
with patch("timmy.autoresearch.subprocess.run") as mock_run:
mock_run.side_effect = FileNotFoundError("aider not found")
result = exp.apply_edit("some hypothesis")
assert "not available" in result
def test_commit_changes_success(self, tmp_path):
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", workspace=tmp_path)
with patch("timmy.autoresearch.subprocess.run") as mock_run:
mock_run.return_value = MagicMock(returncode=0)
success = exp.commit_changes("test commit")
assert success is True
def test_revert_changes_failure(self, tmp_path):
import subprocess
from timmy.autoresearch import SystemExperiment
exp = SystemExperiment(target="x.py", workspace=tmp_path)
with patch("timmy.autoresearch.subprocess.run") as mock_run:
mock_run.side_effect = subprocess.CalledProcessError(1, "git")
success = exp.revert_changes()
assert success is False

View File

@@ -0,0 +1,94 @@
"""Tests for the `timmy learn` CLI command (autoresearch entry point)."""
from unittest.mock import MagicMock, patch
from typer.testing import CliRunner
from timmy.cli import app
runner = CliRunner()
class TestLearnCommand:
"""Tests for `timmy learn`."""
def test_requires_target(self):
result = runner.invoke(app, ["learn"])
assert result.exit_code != 0
assert "target" in result.output.lower() or "target" in (result.stderr or "").lower()
def test_dry_run_shows_hypothesis_no_tox(self, tmp_path):
program_file = tmp_path / "program.md"
program_file.write_text("Improve logging coverage in agent module")
with patch("timmy.autoresearch.subprocess.run") as mock_run:
result = runner.invoke(
app,
[
"learn",
"--target",
"src/timmy/agent.py",
"--program",
str(program_file),
"--max-experiments",
"2",
"--dry-run",
],
)
assert result.exit_code == 0
# tox should never be called in dry-run
mock_run.assert_not_called()
assert "agent.py" in result.output
def test_missing_program_md_warns_but_continues(self, tmp_path):
with patch("timmy.autoresearch.subprocess.run") as mock_run:
mock_run.return_value = MagicMock(returncode=0, stdout="3 passed", stderr="")
result = runner.invoke(
app,
[
"learn",
"--target",
"src/timmy/agent.py",
"--program",
str(tmp_path / "nonexistent.md"),
"--max-experiments",
"1",
"--dry-run",
],
)
assert result.exit_code == 0
def test_dry_run_prints_max_experiments_hypotheses(self, tmp_path):
program_file = tmp_path / "program.md"
program_file.write_text("Fix edge case in parser")
result = runner.invoke(
app,
[
"learn",
"--target",
"src/timmy/parser.py",
"--program",
str(program_file),
"--max-experiments",
"3",
"--dry-run",
],
)
assert result.exit_code == 0
# Should show 3 experiment headers
assert result.output.count("[1/3]") == 1
assert result.output.count("[2/3]") == 1
assert result.output.count("[3/3]") == 1
def test_help_text_present(self):
result = runner.invoke(app, ["learn", "--help"])
assert result.exit_code == 0
assert "--target" in result.output
assert "--metric" in result.output
assert "--budget" in result.output
assert "--max-experiments" in result.output
assert "--dry-run" in result.output