# Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
# Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
"""Autoresearch — autonomous ML experiment loops.
|
|
|
|
Integrates Karpathy's autoresearch pattern: an agent modifies training
|
|
code, runs time-boxed GPU experiments, evaluates a target metric
|
|
(val_bpb by default), and iterates to find improvements.
|
|
|
|
Flow:
|
|
1. prepare_experiment — clone repo + run data prep
|
|
2. run_experiment — execute train.py with wall-clock timeout
|
|
3. evaluate_result — compare metric against baseline
|
|
4. experiment_loop — orchestrate the full cycle
|
|
|
|
All subprocess calls are guarded with timeouts for graceful degradation.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import subprocess
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)

# Upstream repository cloned by prepare_experiment() when no repo_url is given.
DEFAULT_REPO = "https://github.com/karpathy/autoresearch.git"

# Pre-compiled pattern for the default metric line, e.g. "val_bpb: 1.23".
# NOTE(review): not referenced by the code visible here (_extract_metric
# compiles its own pattern) — presumably kept for external callers; confirm
# before removing.
_METRIC_RE = re.compile(r"val_bpb[:\s]+([0-9]+\.?[0-9]*)")
|
|
|
|
|
|
def prepare_experiment(
    workspace: Path,
    repo_url: str = DEFAULT_REPO,
) -> str:
    """Clone the autoresearch repo and run data preparation.

    Degrades gracefully: clone or prepare failures — including subprocess
    timeouts and a missing ``git``/``python`` executable, which previously
    raised out of this function — are reported as status strings.

    Args:
        workspace: Directory to set up the experiment in.
        repo_url: Git URL for the autoresearch repository.

    Returns:
        Status message describing what was prepared (or why it failed).
    """
    workspace = Path(workspace)
    workspace.mkdir(parents=True, exist_ok=True)

    repo_dir = workspace / "autoresearch"
    if not repo_dir.exists():
        logger.info("Cloning autoresearch into %s", repo_dir)
        try:
            result = subprocess.run(
                ["git", "clone", "--depth", "1", repo_url, str(repo_dir)],
                capture_output=True,
                text=True,
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            return "Clone failed: timed out after 120s"
        except OSError as exc:  # e.g. git executable not installed
            return f"Clone failed: {exc}"
        if result.returncode != 0:
            return f"Clone failed: {result.stderr.strip()}"
    else:
        logger.info("Autoresearch repo already present at %s", repo_dir)

    # Run prepare.py (data download + tokeniser training)
    prepare_script = repo_dir / "prepare.py"
    if prepare_script.exists():
        logger.info("Running prepare.py …")
        try:
            result = subprocess.run(
                ["python", str(prepare_script)],
                capture_output=True,
                text=True,
                cwd=str(repo_dir),
                timeout=300,
            )
        except subprocess.TimeoutExpired:
            return "Preparation failed: timed out after 300s"
        except OSError as exc:
            return f"Preparation failed: {exc}"
        if result.returncode != 0:
            return f"Preparation failed: {result.stderr.strip()[:500]}"
        return "Preparation complete — data downloaded and tokeniser trained."

    return "Preparation skipped — no prepare.py found."
|
|
|
|
|
|
def run_experiment(
|
|
workspace: Path,
|
|
timeout: int = 300,
|
|
metric_name: str = "val_bpb",
|
|
) -> dict[str, Any]:
|
|
"""Run a single training experiment with a wall-clock timeout.
|
|
|
|
Args:
|
|
workspace: Experiment workspace (contains autoresearch/ subdir).
|
|
timeout: Maximum wall-clock seconds for the run.
|
|
metric_name: Name of the metric to extract from stdout.
|
|
|
|
Returns:
|
|
Dict with keys: metric (float|None), log (str), duration_s (int),
|
|
success (bool), error (str|None).
|
|
"""
|
|
repo_dir = Path(workspace) / "autoresearch"
|
|
train_script = repo_dir / "train.py"
|
|
|
|
if not train_script.exists():
|
|
return {
|
|
"metric": None,
|
|
"log": "",
|
|
"duration_s": 0,
|
|
"success": False,
|
|
"error": f"train.py not found in {repo_dir}",
|
|
}
|
|
|
|
start = time.monotonic()
|
|
try:
|
|
result = subprocess.run(
|
|
["python", str(train_script)],
|
|
capture_output=True,
|
|
text=True,
|
|
cwd=str(repo_dir),
|
|
timeout=timeout,
|
|
)
|
|
duration = int(time.monotonic() - start)
|
|
output = result.stdout + result.stderr
|
|
|
|
# Extract metric from output
|
|
metric_val = _extract_metric(output, metric_name)
|
|
|
|
return {
|
|
"metric": metric_val,
|
|
"log": output[-2000:], # Keep last 2k chars
|
|
"duration_s": duration,
|
|
"success": result.returncode == 0,
|
|
"error": None if result.returncode == 0 else f"Exit code {result.returncode}",
|
|
}
|
|
except subprocess.TimeoutExpired:
|
|
duration = int(time.monotonic() - start)
|
|
return {
|
|
"metric": None,
|
|
"log": f"Experiment timed out after {timeout}s",
|
|
"duration_s": duration,
|
|
"success": False,
|
|
"error": f"Timed out after {timeout}s",
|
|
}
|
|
except OSError as exc:
|
|
return {
|
|
"metric": None,
|
|
"log": "",
|
|
"duration_s": 0,
|
|
"success": False,
|
|
"error": str(exc),
|
|
}
|
|
|
|
|
|
def _extract_metric(output: str, metric_name: str = "val_bpb") -> float | None:
|
|
"""Extract the last occurrence of a metric value from training output."""
|
|
pattern = re.compile(rf"{re.escape(metric_name)}[:\s]+([0-9]+\.?[0-9]*)")
|
|
matches = pattern.findall(output)
|
|
if matches:
|
|
try:
|
|
return float(matches[-1])
|
|
except ValueError:
|
|
pass
|
|
return None
|
|
|
|
|
|
def evaluate_result(
    current: float,
    baseline: float,
    metric_name: str = "val_bpb",
) -> str:
    """Summarise how *current* compares to *baseline*.

    For val_bpb, lower is better.

    Args:
        current: Current experiment's metric value.
        baseline: Baseline metric to compare against.
        metric_name: Name of the metric (for display).

    Returns:
        Human-readable assessment string.
    """
    diff = current - baseline
    # Guard against a zero baseline to avoid ZeroDivisionError.
    relative = 0.0 if baseline == 0 else (diff / baseline) * 100
    if diff == 0:
        return f"No change: {metric_name} = {current:.4f}"
    verdict = "Improvement" if diff < 0 else "Regression"
    return f"{verdict}: {metric_name} {baseline:.4f} -> {current:.4f} ({relative:+.2f}%)"
|
|
|
|
|
|
def get_experiment_history(workspace: Path) -> list[dict[str, Any]]:
    """Load experiment records from the workspace ``results.jsonl`` file.

    Lines that fail to parse as JSON are silently skipped.

    Returns:
        List of experiment result dicts, most recent first; empty if no file.
    """
    results_file = Path(workspace) / "results.jsonl"
    if not results_file.exists():
        return []

    _skip = object()  # sentinel so a valid JSON "null" line is still kept

    def _parse(line: str) -> Any:
        try:
            return json.loads(line)
        except json.JSONDecodeError:
            return _skip

    records = [
        rec
        for rec in map(_parse, results_file.read_text().strip().splitlines())
        if rec is not _skip
    ]
    records.reverse()
    return records
|
|
|
|
|
|
def _append_result(workspace: Path, result: dict[str, Any]) -> None:
|
|
"""Append a result to the workspace JSONL log."""
|
|
results_file = Path(workspace) / "results.jsonl"
|
|
results_file.parent.mkdir(parents=True, exist_ok=True)
|
|
with results_file.open("a") as f:
|
|
f.write(json.dumps(result) + "\n")
|
|
|
|
|
|
def _extract_pass_rate(output: str) -> float | None:
|
|
"""Extract pytest pass rate as a percentage from tox/pytest output."""
|
|
passed_m = re.search(r"(\d+) passed", output)
|
|
failed_m = re.search(r"(\d+) failed", output)
|
|
if passed_m:
|
|
passed = int(passed_m.group(1))
|
|
failed = int(failed_m.group(1)) if failed_m else 0
|
|
total = passed + failed
|
|
return (passed / total * 100.0) if total > 0 else 100.0
|
|
return None
|
|
|
|
|
|
def _extract_coverage(output: str) -> float | None:
|
|
"""Extract total coverage percentage from coverage output."""
|
|
coverage_m = re.search(r"(?:TOTAL\s+\d+\s+\d+\s+|Total coverage:\s*)(\d+)%", output)
|
|
if coverage_m:
|
|
try:
|
|
return float(coverage_m.group(1))
|
|
except ValueError:
|
|
pass
|
|
return None
|
|
|
|
|
|
# ── Higher-is-better metric names ────────────────────────────────────────────
# Metrics where a larger value is an improvement. Every other metric name
# (e.g. val_bpb) is treated as lower-is-better by SystemExperiment.
_HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"})
|
|
|
|
|
|
class SystemExperiment:
    """An autoresearch experiment targeting a specific module with a configurable metric.

    Encapsulates the hypothesis → edit → tox → evaluate → commit/revert loop
    for a single target file or module.

    Args:
        target: Path or module name to optimise (e.g. ``src/timmy/agent.py``).
        metric: Metric to extract from tox output. Built-in values:
            ``unit_pass_rate`` (default), ``coverage``, ``val_bpb``.
            Any other value is forwarded to :func:`_extract_metric`.
        budget_minutes: Wall-clock budget per experiment (default 5 min).
        workspace: Working directory for subprocess calls. Defaults to ``cwd``.
    """

    def __init__(
        self,
        target: str,
        metric: str = "unit_pass_rate",
        budget_minutes: int = 5,
        workspace: Path | None = None,
    ) -> None:
        self.target = target
        self.metric = metric
        self.budget_seconds = budget_minutes * 60
        self.workspace = Path(workspace) if workspace else Path.cwd()

    # ── Hypothesis generation ─────────────────────────────────────────────────

    def generate_hypothesis(self, program_content: str = "") -> str:
        """Return a plain-English hypothesis for the next experiment.

        Uses the first non-empty, non-comment line of *program_content* when
        available; falls back to a generic description based on target and
        metric.
        """
        first_line = ""
        for line in program_content.splitlines():
            stripped = line.strip()
            if stripped and not stripped.startswith("#"):
                first_line = stripped[:120]  # keep the hypothesis short
                break
        if first_line:
            return f"[{self.target}] {first_line}"
        return f"Improve {self.metric} for {self.target}"

    # ── Edit phase ────────────────────────────────────────────────────────────

    def apply_edit(self, hypothesis: str, model: str = "qwen3:30b") -> str:
        """Apply code edits to *target* via Aider.

        Returns a status string. Degrades gracefully — never raises.

        Args:
            hypothesis: Plain-English description of the change to make.
            model: Ollama model name passed to Aider.
        """
        prompt = f"Edit {self.target}: {hypothesis}"
        try:
            result = subprocess.run(
                ["aider", "--no-git", "--model", f"ollama/{model}", "--quiet", prompt],
                capture_output=True,
                text=True,
                timeout=self.budget_seconds,
                cwd=str(self.workspace),
            )
            if result.returncode == 0:
                return result.stdout or "Edit applied."
            return f"Aider error (exit {result.returncode}): {result.stderr[:500]}"
        except FileNotFoundError:
            logger.warning("Aider not installed — edit skipped")
            return "Aider not available — edit skipped"
        except subprocess.TimeoutExpired:
            logger.warning("Aider timed out after %ds", self.budget_seconds)
            return "Aider timed out"
        except (OSError, subprocess.SubprocessError) as exc:
            logger.warning("Aider failed: %s", exc)
            return f"Edit failed: {exc}"

    # ── Evaluation phase ──────────────────────────────────────────────────────

    def run_tox(self, tox_env: str = "unit") -> dict[str, Any]:
        """Run *tox_env* and return a result dict.

        Returns:
            Dict with keys: ``metric`` (float|None), ``log`` (str),
            ``duration_s`` (int), ``success`` (bool), ``error`` (str|None).
        """
        start = time.monotonic()
        try:
            result = subprocess.run(
                ["tox", "-e", tox_env],
                capture_output=True,
                text=True,
                timeout=self.budget_seconds,
                cwd=str(self.workspace),
            )
            duration = int(time.monotonic() - start)
            output = result.stdout + result.stderr
            metric_val = self._extract_tox_metric(output)
            return {
                "metric": metric_val,
                "log": output[-3000:],  # keep only the tail of the log
                "duration_s": duration,
                "success": result.returncode == 0,
                "error": None if result.returncode == 0 else f"Exit code {result.returncode}",
            }
        except subprocess.TimeoutExpired:
            duration = int(time.monotonic() - start)
            return {
                "metric": None,
                "log": f"Budget exceeded after {self.budget_seconds}s",
                "duration_s": duration,
                "success": False,
                "error": f"Budget exceeded after {self.budget_seconds}s",
            }
        except OSError as exc:  # e.g. tox executable missing
            return {"metric": None, "log": "", "duration_s": 0, "success": False, "error": str(exc)}

    def _extract_tox_metric(self, output: str) -> float | None:
        """Dispatch to the correct metric extractor based on *self.metric*."""
        if self.metric == "unit_pass_rate":
            return _extract_pass_rate(output)
        if self.metric == "coverage":
            return _extract_coverage(output)
        return _extract_metric(output, self.metric)

    def evaluate(self, current: float | None, baseline: float | None) -> str:
        """Compare *current* metric against *baseline* and return an assessment."""
        if current is None:
            return "Indeterminate: metric not extracted from output"
        if baseline is None:
            # First run: there is nothing to compare against yet.
            unit = "%" if self.metric in _HIGHER_IS_BETTER else ""
            return f"Baseline: {self.metric} = {current:.2f}{unit}"

        if self.metric in _HIGHER_IS_BETTER:
            delta = current - baseline
            pct = (delta / baseline * 100) if baseline != 0 else 0.0
            if delta > 0:
                return f"Improvement: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
            if delta < 0:
                return f"Regression: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
            return f"No change: {self.metric} = {current:.2f}%"

        # lower-is-better (val_bpb, loss, etc.)
        return evaluate_result(current, baseline, self.metric)

    def is_improvement(self, current: float, baseline: float) -> bool:
        """Return True if *current* is better than *baseline* for this metric."""
        if self.metric in _HIGHER_IS_BETTER:
            return current > baseline
        return current < baseline  # lower-is-better

    # ── Git phase ─────────────────────────────────────────────────────────────

    def commit_changes(self, message: str) -> bool:
        """Stage and commit all changes. Returns True on success.

        Never raises: SubprocessError covers both CalledProcessError and
        TimeoutExpired; OSError covers a missing git executable. (Previously
        only CalledProcessError was caught, so a hung or absent git raised.)
        """
        try:
            subprocess.run(
                ["git", "add", "-A"], cwd=str(self.workspace), check=True, timeout=30
            )
            subprocess.run(
                ["git", "commit", "-m", message],
                cwd=str(self.workspace),
                check=True,
                timeout=30,
            )
            return True
        except (subprocess.SubprocessError, OSError) as exc:
            logger.warning("Git commit failed: %s", exc)
            return False

    def revert_changes(self) -> bool:
        """Revert all uncommitted changes. Returns True on success.

        Never raises — see :meth:`commit_changes` for the exception rationale.
        """
        try:
            subprocess.run(
                ["git", "checkout", "--", "."],
                cwd=str(self.workspace),
                check=True,
                timeout=30,
            )
            return True
        except (subprocess.SubprocessError, OSError) as exc:
            logger.warning("Git revert failed: %s", exc)
            return False
|