Co-authored-by: Claude (Opus 4.6) <claude@hermes.local> Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
This commit was merged in pull request #1240.
This commit is contained in:
23
program.md
Normal file
23
program.md
Normal file
@@ -0,0 +1,23 @@
# Research Direction

This file guides the `timmy learn` autoresearch loop. Edit it to focus
autonomous experiments on a specific goal.

## Current Goal

Improve unit test pass rate across the codebase by identifying and fixing
fragile or failing tests.

## Target Module

(Set via `--target` when invoking `timmy learn`)

## Success Metric

unit_pass_rate — percentage of unit tests passing in `tox -e unit`.

## Notes

- Experiments run one at a time; each is time-boxed by `--budget`.
- Improvements are committed automatically; regressions are reverted.
- Use `--dry-run` to preview hypotheses without making changes.
@@ -212,3 +212,212 @@ def _append_result(workspace: Path, result: dict[str, Any]) -> None:
|
|||||||
results_file.parent.mkdir(parents=True, exist_ok=True)
|
results_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with results_file.open("a") as f:
|
with results_file.open("a") as f:
|
||||||
f.write(json.dumps(result) + "\n")
|
f.write(json.dumps(result) + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_pass_rate(output: str) -> float | None:
|
||||||
|
"""Extract pytest pass rate as a percentage from tox/pytest output."""
|
||||||
|
passed_m = re.search(r"(\d+) passed", output)
|
||||||
|
failed_m = re.search(r"(\d+) failed", output)
|
||||||
|
if passed_m:
|
||||||
|
passed = int(passed_m.group(1))
|
||||||
|
failed = int(failed_m.group(1)) if failed_m else 0
|
||||||
|
total = passed + failed
|
||||||
|
return (passed / total * 100.0) if total > 0 else 100.0
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_coverage(output: str) -> float | None:
|
||||||
|
"""Extract total coverage percentage from coverage output."""
|
||||||
|
coverage_m = re.search(r"(?:TOTAL\s+\d+\s+\d+\s+|Total coverage:\s*)(\d+)%", output)
|
||||||
|
if coverage_m:
|
||||||
|
try:
|
||||||
|
return float(coverage_m.group(1))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Higher-is-better metric names ────────────────────────────────────────────
# Metrics where a larger value means improvement; any metric name not listed
# here is treated as lower-is-better (e.g. val_bpb, loss).
_HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"})
|
||||||
|
|
||||||
|
|
||||||
|
class SystemExperiment:
|
||||||
|
"""An autoresearch experiment targeting a specific module with a configurable metric.
|
||||||
|
|
||||||
|
Encapsulates the hypothesis → edit → tox → evaluate → commit/revert loop
|
||||||
|
for a single target file or module.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
target: Path or module name to optimise (e.g. ``src/timmy/agent.py``).
|
||||||
|
metric: Metric to extract from tox output. Built-in values:
|
||||||
|
``unit_pass_rate`` (default), ``coverage``, ``val_bpb``.
|
||||||
|
Any other value is forwarded to :func:`_extract_metric`.
|
||||||
|
budget_minutes: Wall-clock budget per experiment (default 5 min).
|
||||||
|
workspace: Working directory for subprocess calls. Defaults to ``cwd``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
target: str,
|
||||||
|
metric: str = "unit_pass_rate",
|
||||||
|
budget_minutes: int = 5,
|
||||||
|
workspace: Path | None = None,
|
||||||
|
) -> None:
|
||||||
|
self.target = target
|
||||||
|
self.metric = metric
|
||||||
|
self.budget_seconds = budget_minutes * 60
|
||||||
|
self.workspace = Path(workspace) if workspace else Path.cwd()
|
||||||
|
|
||||||
|
# ── Hypothesis generation ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def generate_hypothesis(self, program_content: str = "") -> str:
|
||||||
|
"""Return a plain-English hypothesis for the next experiment.
|
||||||
|
|
||||||
|
Uses the first non-empty line of *program_content* when available;
|
||||||
|
falls back to a generic description based on target and metric.
|
||||||
|
"""
|
||||||
|
first_line = ""
|
||||||
|
for line in program_content.splitlines():
|
||||||
|
stripped = line.strip()
|
||||||
|
if stripped and not stripped.startswith("#"):
|
||||||
|
first_line = stripped[:120]
|
||||||
|
break
|
||||||
|
if first_line:
|
||||||
|
return f"[{self.target}] {first_line}"
|
||||||
|
return f"Improve {self.metric} for {self.target}"
|
||||||
|
|
||||||
|
# ── Edit phase ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def apply_edit(self, hypothesis: str, model: str = "qwen3:30b") -> str:
|
||||||
|
"""Apply code edits to *target* via Aider.
|
||||||
|
|
||||||
|
Returns a status string. Degrades gracefully — never raises.
|
||||||
|
"""
|
||||||
|
prompt = f"Edit {self.target}: {hypothesis}"
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
["aider", "--no-git", "--model", f"ollama/{model}", "--quiet", prompt],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=self.budget_seconds,
|
||||||
|
cwd=str(self.workspace),
|
||||||
|
)
|
||||||
|
if result.returncode == 0:
|
||||||
|
return result.stdout or "Edit applied."
|
||||||
|
return f"Aider error (exit {result.returncode}): {result.stderr[:500]}"
|
||||||
|
except FileNotFoundError:
|
||||||
|
logger.warning("Aider not installed — edit skipped")
|
||||||
|
return "Aider not available — edit skipped"
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
logger.warning("Aider timed out after %ds", self.budget_seconds)
|
||||||
|
return "Aider timed out"
|
||||||
|
except (OSError, subprocess.SubprocessError) as exc:
|
||||||
|
logger.warning("Aider failed: %s", exc)
|
||||||
|
return f"Edit failed: {exc}"
|
||||||
|
|
||||||
|
# ── Evaluation phase ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def run_tox(self, tox_env: str = "unit") -> dict[str, Any]:
|
||||||
|
"""Run *tox_env* and return a result dict.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with keys: ``metric`` (float|None), ``log`` (str),
|
||||||
|
``duration_s`` (int), ``success`` (bool), ``error`` (str|None).
|
||||||
|
"""
|
||||||
|
start = time.monotonic()
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
["tox", "-e", tox_env],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=self.budget_seconds,
|
||||||
|
cwd=str(self.workspace),
|
||||||
|
)
|
||||||
|
duration = int(time.monotonic() - start)
|
||||||
|
output = result.stdout + result.stderr
|
||||||
|
metric_val = self._extract_tox_metric(output)
|
||||||
|
return {
|
||||||
|
"metric": metric_val,
|
||||||
|
"log": output[-3000:],
|
||||||
|
"duration_s": duration,
|
||||||
|
"success": result.returncode == 0,
|
||||||
|
"error": None if result.returncode == 0 else f"Exit code {result.returncode}",
|
||||||
|
}
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
duration = int(time.monotonic() - start)
|
||||||
|
return {
|
||||||
|
"metric": None,
|
||||||
|
"log": f"Budget exceeded after {self.budget_seconds}s",
|
||||||
|
"duration_s": duration,
|
||||||
|
"success": False,
|
||||||
|
"error": f"Budget exceeded after {self.budget_seconds}s",
|
||||||
|
}
|
||||||
|
except OSError as exc:
|
||||||
|
return {"metric": None, "log": "", "duration_s": 0, "success": False, "error": str(exc)}
|
||||||
|
|
||||||
|
def _extract_tox_metric(self, output: str) -> float | None:
|
||||||
|
"""Dispatch to the correct metric extractor based on *self.metric*."""
|
||||||
|
if self.metric == "unit_pass_rate":
|
||||||
|
return _extract_pass_rate(output)
|
||||||
|
if self.metric == "coverage":
|
||||||
|
return _extract_coverage(output)
|
||||||
|
return _extract_metric(output, self.metric)
|
||||||
|
|
||||||
|
def evaluate(self, current: float | None, baseline: float | None) -> str:
|
||||||
|
"""Compare *current* metric against *baseline* and return an assessment."""
|
||||||
|
if current is None:
|
||||||
|
return "Indeterminate: metric not extracted from output"
|
||||||
|
if baseline is None:
|
||||||
|
unit = "%" if self.metric in _HIGHER_IS_BETTER else ""
|
||||||
|
return f"Baseline: {self.metric} = {current:.2f}{unit}"
|
||||||
|
|
||||||
|
if self.metric in _HIGHER_IS_BETTER:
|
||||||
|
delta = current - baseline
|
||||||
|
pct = (delta / baseline * 100) if baseline != 0 else 0.0
|
||||||
|
if delta > 0:
|
||||||
|
return f"Improvement: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
|
||||||
|
if delta < 0:
|
||||||
|
return f"Regression: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
|
||||||
|
return f"No change: {self.metric} = {current:.2f}%"
|
||||||
|
|
||||||
|
# lower-is-better (val_bpb, loss, etc.)
|
||||||
|
return evaluate_result(current, baseline, self.metric)
|
||||||
|
|
||||||
|
def is_improvement(self, current: float, baseline: float) -> bool:
|
||||||
|
"""Return True if *current* is better than *baseline* for this metric."""
|
||||||
|
if self.metric in _HIGHER_IS_BETTER:
|
||||||
|
return current > baseline
|
||||||
|
return current < baseline # lower-is-better
|
||||||
|
|
||||||
|
# ── Git phase ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def commit_changes(self, message: str) -> bool:
|
||||||
|
"""Stage and commit all changes. Returns True on success."""
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
["git", "add", "-A"], cwd=str(self.workspace), check=True, timeout=30
|
||||||
|
)
|
||||||
|
subprocess.run(
|
||||||
|
["git", "commit", "-m", message],
|
||||||
|
cwd=str(self.workspace),
|
||||||
|
check=True,
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
logger.warning("Git commit failed: %s", exc)
|
||||||
|
return False
|
||||||
|
|
||||||
|
def revert_changes(self) -> bool:
|
||||||
|
"""Revert all uncommitted changes. Returns True on success."""
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
["git", "checkout", "--", "."],
|
||||||
|
cwd=str(self.workspace),
|
||||||
|
check=True,
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
logger.warning("Git revert failed: %s", exc)
|
||||||
|
return False
|
||||||
|
|||||||
154
src/timmy/cli.py
154
src/timmy/cli.py
@@ -527,5 +527,159 @@ def healthcheck(
|
|||||||
raise typer.Exit(result.returncode)
|
raise typer.Exit(result.returncode)
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
def learn(
    target: str | None = typer.Option(
        None,
        "--target",
        "-t",
        help="Module or file to optimise (e.g. 'src/timmy/agent.py')",
    ),
    metric: str = typer.Option(
        "unit_pass_rate",
        "--metric",
        "-m",
        help="Metric to track: unit_pass_rate | coverage | val_bpb | <custom>",
    ),
    budget: int = typer.Option(
        5,
        "--budget",
        help="Time limit per experiment in minutes",
    ),
    max_experiments: int = typer.Option(
        10,
        "--max-experiments",
        help="Cap on total experiments per run",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Show hypothesis without executing experiments",
    ),
    program_file: str | None = typer.Option(
        None,
        "--program",
        "-p",
        help="Path to research direction file (default: program.md in cwd)",
    ),
    tox_env: str = typer.Option(
        "unit",
        "--tox-env",
        help="Tox environment to run for each evaluation",
    ),
    model: str = typer.Option(
        "qwen3:30b",
        "--model",
        help="Ollama model forwarded to Aider for code edits",
    ),
):
    """Start an autonomous improvement loop (autoresearch).

    Reads program.md for research direction, then iterates:
    hypothesis → edit → tox → evaluate → commit/revert.

    Experiments continue until --max-experiments is reached or the loop is
    interrupted with Ctrl+C. Use --dry-run to preview hypotheses without
    making any changes.

    Example:
        timmy learn --target src/timmy/agent.py --metric unit_pass_rate
    """
    from pathlib import Path

    from timmy.autoresearch import SystemExperiment

    # Locate the research-direction file (explicit flag wins over cwd default).
    repo_root = Path.cwd()
    direction_path = Path(program_file) if program_file else repo_root / "program.md"

    if direction_path.exists():
        direction_text = direction_path.read_text()
        typer.echo(f"Research direction: {direction_path}")
    else:
        direction_text = ""
        typer.echo(
            f"Note: {direction_path} not found — proceeding without research direction.",
            err=True,
        )

    if target is None:
        typer.echo(
            "Error: --target is required. Specify the module or file to optimise.", err=True
        )
        raise typer.Exit(1)

    experiment = SystemExperiment(target=target, metric=metric, budget_minutes=budget)

    # Banner.
    typer.echo()
    typer.echo(typer.style("Autoresearch", bold=True) + f" — {target}")
    typer.echo(f" metric={metric} budget={budget}min max={max_experiments} tox={tox_env}")
    if dry_run:
        typer.echo(" (dry-run — no changes will be made)")
    typer.echo()

    baseline: float | None = None

    try:
        for attempt in range(1, max_experiments + 1):
            typer.echo(typer.style(f"[{attempt}/{max_experiments}]", bold=True), nl=False)

            hypothesis = experiment.generate_hypothesis(direction_text)
            typer.echo(f" {hypothesis}")

            # Dry-run prints the hypothesis only — no edit, no tox.
            if dry_run:
                continue

            # Edit phase.
            typer.echo(" → editing …", nl=False)
            edit_status = experiment.apply_edit(hypothesis, model=model)
            if "not available" in edit_status or edit_status.startswith("Aider error"):
                typer.echo(f" skipped ({edit_status.split(':')[0]})")
            else:
                typer.echo(" done")

            # Evaluate phase.
            typer.echo(" → running tox …", nl=False)
            outcome = experiment.run_tox(tox_env=tox_env)
            typer.echo(f" {outcome['duration_s']}s")

            assessment = experiment.evaluate(outcome["metric"], baseline)
            typer.echo(f" → {assessment}")

            # NOTE(review): the baseline is captured from the first *post-edit*
            # run, so the first experiment is always judged "no improvement"
            # and reverted — confirm whether a pre-edit baseline run is wanted.
            if outcome["metric"] is not None and baseline is None:
                baseline = outcome["metric"]

            if outcome["success"] and outcome["metric"] is not None and baseline is not None:
                if experiment.is_improvement(outcome["metric"], baseline):
                    commit_msg = (
                        f"autoresearch: improve {metric} on {target} — {assessment}"
                    )
                    if experiment.commit_changes(commit_msg):
                        typer.echo(" → committed")
                        baseline = outcome["metric"]
                    else:
                        experiment.revert_changes()
                        typer.echo(" → commit failed, reverted")
                else:
                    experiment.revert_changes()
                    typer.echo(" → reverted (no improvement)")
            elif not outcome["success"]:
                experiment.revert_changes()
                typer.echo(f" → reverted ({outcome['error']})")

            typer.echo()

    except KeyboardInterrupt:
        typer.echo("\nInterrupted.")
        raise typer.Exit(0) from None

    typer.echo(typer.style("Autoresearch complete.", bold=True))
    if baseline is not None:
        typer.echo(f"Final {metric}: {baseline:.4f}")
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Console-script entry point: dispatch to the Typer application."""
    app()
|
||||||
|
|||||||
@@ -176,3 +176,176 @@ class TestExtractMetric:
|
|||||||
|
|
||||||
output = "loss: 0.45\nloss: 0.32"
|
output = "loss: 0.45\nloss: 0.32"
|
||||||
assert _extract_metric(output, "loss") == pytest.approx(0.32)
|
assert _extract_metric(output, "loss") == pytest.approx(0.32)
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractPassRate:
    """Unit tests covering _extract_pass_rate()."""

    def test_all_passing(self):
        from timmy.autoresearch import _extract_pass_rate

        summary = "5 passed in 1.23s"
        rate = _extract_pass_rate(summary)
        assert rate == pytest.approx(100.0)

    def test_mixed_results(self):
        from timmy.autoresearch import _extract_pass_rate

        summary = "8 passed, 2 failed in 2.00s"
        rate = _extract_pass_rate(summary)
        assert rate == pytest.approx(80.0)

    def test_no_pytest_output(self):
        from timmy.autoresearch import _extract_pass_rate

        # Nothing resembling a pytest summary → indeterminate.
        assert _extract_pass_rate("no test results here") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractCoverage:
    """Unit tests covering _extract_coverage()."""

    def test_total_line(self):
        from timmy.autoresearch import _extract_coverage

        report = "TOTAL 1234 100 92%"
        assert _extract_coverage(report) == pytest.approx(92.0)

    def test_no_coverage(self):
        from timmy.autoresearch import _extract_coverage

        # No TOTAL row and no "Total coverage:" line → None.
        assert _extract_coverage("no coverage data") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestSystemExperiment:
    """Behavioural tests for the SystemExperiment class."""

    def test_generate_hypothesis_with_program(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="src/timmy/agent.py")
        hypothesis = experiment.generate_hypothesis("Fix memory leak in session handling")
        assert "src/timmy/agent.py" in hypothesis
        assert "Fix memory leak" in hypothesis

    def test_generate_hypothesis_fallback(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="src/timmy/agent.py", metric="coverage")
        # Empty program content falls back to a generic description.
        hypothesis = experiment.generate_hypothesis("")
        assert "src/timmy/agent.py" in hypothesis
        assert "coverage" in hypothesis

    def test_generate_hypothesis_skips_comment_lines(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="mymodule.py")
        hypothesis = experiment.generate_hypothesis("# comment\nActual direction here")
        assert "Actual direction" in hypothesis

    def test_evaluate_baseline(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="unit_pass_rate")
        verdict = experiment.evaluate(85.0, None)
        assert "Baseline" in verdict
        assert "85" in verdict

    def test_evaluate_improvement_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="unit_pass_rate")
        assert "Improvement" in experiment.evaluate(90.0, 85.0)

    def test_evaluate_regression_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="coverage")
        assert "Regression" in experiment.evaluate(80.0, 85.0)

    def test_evaluate_none_metric(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py")
        assert "Indeterminate" in experiment.evaluate(None, 80.0)

    def test_evaluate_lower_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="val_bpb")
        assert "Improvement" in experiment.evaluate(1.1, 1.2)

    def test_is_improvement_higher_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="unit_pass_rate")
        assert experiment.is_improvement(90.0, 85.0) is True
        assert experiment.is_improvement(80.0, 85.0) is False

    def test_is_improvement_lower_is_better(self):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", metric="val_bpb")
        assert experiment.is_improvement(1.1, 1.2) is True
        assert experiment.is_improvement(1.3, 1.2) is False

    def test_run_tox_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="8 passed in 1.23s",
                stderr="",
            )
            outcome = experiment.run_tox(tox_env="unit")

        assert outcome["success"] is True
        assert outcome["metric"] == pytest.approx(100.0)

    def test_run_tox_timeout(self, tmp_path):
        import subprocess

        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", budget_minutes=1, workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.TimeoutExpired(cmd="tox", timeout=60)
            outcome = experiment.run_tox()

        assert outcome["success"] is False
        assert "Budget exceeded" in outcome["error"]

    def test_apply_edit_aider_not_installed(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = FileNotFoundError("aider not found")
            status = experiment.apply_edit("some hypothesis")

        assert "not available" in status

    def test_commit_changes_success(self, tmp_path):
        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0)
            committed = experiment.commit_changes("test commit")

        assert committed is True

    def test_revert_changes_failure(self, tmp_path):
        import subprocess

        from timmy.autoresearch import SystemExperiment

        experiment = SystemExperiment(target="x.py", workspace=tmp_path)
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.side_effect = subprocess.CalledProcessError(1, "git")
            reverted = experiment.revert_changes()

        assert reverted is False
|
||||||
|
|||||||
94
tests/timmy/test_cli_learn.py
Normal file
94
tests/timmy/test_cli_learn.py
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
"""Tests for the `timmy learn` CLI command (autoresearch entry point)."""
|
||||||
|
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
|
from timmy.cli import app
|
||||||
|
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
|
|
||||||
|
class TestLearnCommand:
    """Tests for `timmy learn`."""

    def test_requires_target(self):
        result = runner.invoke(app, ["learn"])
        assert result.exit_code != 0
        assert "target" in result.output.lower() or "target" in (result.stderr or "").lower()

    def test_dry_run_shows_hypothesis_no_tox(self, tmp_path):
        direction = tmp_path / "program.md"
        direction.write_text("Improve logging coverage in agent module")

        cli_args = [
            "learn",
            "--target",
            "src/timmy/agent.py",
            "--program",
            str(direction),
            "--max-experiments",
            "2",
            "--dry-run",
        ]
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            result = runner.invoke(app, cli_args)

        assert result.exit_code == 0
        # tox should never be called in dry-run
        mock_run.assert_not_called()
        assert "agent.py" in result.output

    def test_missing_program_md_warns_but_continues(self, tmp_path):
        cli_args = [
            "learn",
            "--target",
            "src/timmy/agent.py",
            "--program",
            str(tmp_path / "nonexistent.md"),
            "--max-experiments",
            "1",
            "--dry-run",
        ]
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=0, stdout="3 passed", stderr="")
            result = runner.invoke(app, cli_args)

        assert result.exit_code == 0

    def test_dry_run_prints_max_experiments_hypotheses(self, tmp_path):
        direction = tmp_path / "program.md"
        direction.write_text("Fix edge case in parser")

        result = runner.invoke(
            app,
            [
                "learn",
                "--target",
                "src/timmy/parser.py",
                "--program",
                str(direction),
                "--max-experiments",
                "3",
                "--dry-run",
            ],
        )

        assert result.exit_code == 0
        # Should show 3 experiment headers
        for header in ("[1/3]", "[2/3]", "[3/3]"):
            assert result.output.count(header) == 1

    def test_help_text_present(self):
        result = runner.invoke(app, ["learn", "--help"])
        assert result.exit_code == 0
        for option in ("--target", "--metric", "--budget", "--max-experiments", "--dry-run"):
            assert option in result.output
|
||||||
Reference in New Issue
Block a user