# Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
# Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
"""Autoresearch — autonomous ML experiment loops.
|
|
|
|
Integrates Karpathy's autoresearch pattern: an agent modifies training
|
|
code, runs time-boxed GPU experiments, evaluates a target metric
|
|
(val_bpb by default), and iterates to find improvements.
|
|
|
|
Flow:
|
|
1. prepare_experiment — clone repo + run data prep
|
|
2. run_experiment — execute train.py with wall-clock timeout
|
|
3. evaluate_result — compare metric against baseline
|
|
4. experiment_loop — orchestrate the full cycle
|
|
|
|
All subprocess calls are guarded with timeouts for graceful degradation.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import subprocess
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)

# Upstream repository cloned by prepare_experiment() when no repo_url is given.
DEFAULT_REPO = "https://github.com/karpathy/autoresearch.git"

# Pre-compiled pattern for the default metric line, e.g. "val_bpb: 1.23".
# NOTE(review): not referenced by the code visible here (_extract_metric
# compiles its own pattern) — presumably kept for external callers; confirm
# before removing.
_METRIC_RE = re.compile(r"val_bpb[:\s]+([0-9]+\.?[0-9]*)")
|
|
|
|
|
|
def prepare_experiment(
    workspace: Path,
    repo_url: str = DEFAULT_REPO,
) -> str:
    """Clone the autoresearch repo and run data preparation.

    Degrades gracefully: clone or prepare failures — including subprocess
    timeouts and a missing ``git``/``python`` executable, which previously
    raised out of this function — are reported as status strings.

    Args:
        workspace: Directory to set up the experiment in.
        repo_url: Git URL for the autoresearch repository.

    Returns:
        Status message describing what was prepared (or why it failed).
    """
    workspace = Path(workspace)
    workspace.mkdir(parents=True, exist_ok=True)

    repo_dir = workspace / "autoresearch"
    if not repo_dir.exists():
        logger.info("Cloning autoresearch into %s", repo_dir)
        try:
            result = subprocess.run(
                ["git", "clone", "--depth", "1", repo_url, str(repo_dir)],
                capture_output=True,
                text=True,
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            return "Clone failed: timed out after 120s"
        except OSError as exc:  # e.g. git executable not installed
            return f"Clone failed: {exc}"
        if result.returncode != 0:
            return f"Clone failed: {result.stderr.strip()}"
    else:
        logger.info("Autoresearch repo already present at %s", repo_dir)

    # Run prepare.py (data download + tokeniser training)
    prepare_script = repo_dir / "prepare.py"
    if prepare_script.exists():
        logger.info("Running prepare.py …")
        try:
            result = subprocess.run(
                ["python", str(prepare_script)],
                capture_output=True,
                text=True,
                cwd=str(repo_dir),
                timeout=300,
            )
        except subprocess.TimeoutExpired:
            return "Preparation failed: timed out after 300s"
        except OSError as exc:
            return f"Preparation failed: {exc}"
        if result.returncode != 0:
            return f"Preparation failed: {result.stderr.strip()[:500]}"
        return "Preparation complete — data downloaded and tokeniser trained."

    return "Preparation skipped — no prepare.py found."
|
|
|
|
|
|
def run_experiment(
|
|
workspace: Path,
|
|
timeout: int = 300,
|
|
metric_name: str = "val_bpb",
|
|
) -> dict[str, Any]:
|
|
"""Run a single training experiment with a wall-clock timeout.
|
|
|
|
Args:
|
|
workspace: Experiment workspace (contains autoresearch/ subdir).
|
|
timeout: Maximum wall-clock seconds for the run.
|
|
metric_name: Name of the metric to extract from stdout.
|
|
|
|
Returns:
|
|
Dict with keys: metric (float|None), log (str), duration_s (int),
|
|
success (bool), error (str|None).
|
|
"""
|
|
repo_dir = Path(workspace) / "autoresearch"
|
|
train_script = repo_dir / "train.py"
|
|
|
|
if not train_script.exists():
|
|
return {
|
|
"metric": None,
|
|
"log": "",
|
|
"duration_s": 0,
|
|
"success": False,
|
|
"error": f"train.py not found in {repo_dir}",
|
|
}
|
|
|
|
start = time.monotonic()
|
|
try:
|
|
result = subprocess.run(
|
|
["python", str(train_script)],
|
|
capture_output=True,
|
|
text=True,
|
|
cwd=str(repo_dir),
|
|
timeout=timeout,
|
|
)
|
|
duration = int(time.monotonic() - start)
|
|
output = result.stdout + result.stderr
|
|
|
|
# Extract metric from output
|
|
metric_val = _extract_metric(output, metric_name)
|
|
|
|
return {
|
|
"metric": metric_val,
|
|
"log": output[-2000:], # Keep last 2k chars
|
|
"duration_s": duration,
|
|
"success": result.returncode == 0,
|
|
"error": None if result.returncode == 0 else f"Exit code {result.returncode}",
|
|
}
|
|
except subprocess.TimeoutExpired:
|
|
duration = int(time.monotonic() - start)
|
|
return {
|
|
"metric": None,
|
|
"log": f"Experiment timed out after {timeout}s",
|
|
"duration_s": duration,
|
|
"success": False,
|
|
"error": f"Timed out after {timeout}s",
|
|
}
|
|
except OSError as exc:
|
|
return {
|
|
"metric": None,
|
|
"log": "",
|
|
"duration_s": 0,
|
|
"success": False,
|
|
"error": str(exc),
|
|
}
|
|
|
|
|
|
def _extract_metric(output: str, metric_name: str = "val_bpb") -> float | None:
|
|
"""Extract the last occurrence of a metric value from training output."""
|
|
pattern = re.compile(rf"{re.escape(metric_name)}[:\s]+([0-9]+\.?[0-9]*)")
|
|
matches = pattern.findall(output)
|
|
if matches:
|
|
try:
|
|
return float(matches[-1])
|
|
except ValueError:
|
|
pass
|
|
return None
|
|
|
|
|
|
def evaluate_result(
    current: float,
    baseline: float,
    metric_name: str = "val_bpb",
) -> str:
    """Summarise how *current* compares to *baseline*.

    For val_bpb, lower is better.

    Args:
        current: Current experiment's metric value.
        baseline: Baseline metric to compare against.
        metric_name: Name of the metric (for display).

    Returns:
        Human-readable assessment string.
    """
    diff = current - baseline
    # Guard against a zero baseline to avoid ZeroDivisionError.
    relative = 0.0 if baseline == 0 else (diff / baseline) * 100
    if diff == 0:
        return f"No change: {metric_name} = {current:.4f}"
    verdict = "Improvement" if diff < 0 else "Regression"
    return f"{verdict}: {metric_name} {baseline:.4f} -> {current:.4f} ({relative:+.2f}%)"
|
|
|
|
|
|
def get_experiment_history(workspace: Path) -> list[dict[str, Any]]:
    """Load experiment records from the workspace ``results.jsonl`` file.

    Lines that fail to parse as JSON are silently skipped.

    Returns:
        List of experiment result dicts, most recent first; empty if no file.
    """
    results_file = Path(workspace) / "results.jsonl"
    if not results_file.exists():
        return []

    _skip = object()  # sentinel so a valid JSON "null" line is still kept

    def _parse(line: str) -> Any:
        try:
            return json.loads(line)
        except json.JSONDecodeError:
            return _skip

    records = [
        rec
        for rec in map(_parse, results_file.read_text().strip().splitlines())
        if rec is not _skip
    ]
    records.reverse()
    return records
|
|
|
|
|
|
def _append_result(workspace: Path, result: dict[str, Any]) -> None:
|
|
"""Append a result to the workspace JSONL log."""
|
|
results_file = Path(workspace) / "results.jsonl"
|
|
results_file.parent.mkdir(parents=True, exist_ok=True)
|
|
with results_file.open("a") as f:
|
|
f.write(json.dumps(result) + "\n")
|
|
|
|
|
|
def _extract_pass_rate(output: str) -> float | None:
|
|
"""Extract pytest pass rate as a percentage from tox/pytest output."""
|
|
passed_m = re.search(r"(\d+) passed", output)
|
|
failed_m = re.search(r"(\d+) failed", output)
|
|
if passed_m:
|
|
passed = int(passed_m.group(1))
|
|
failed = int(failed_m.group(1)) if failed_m else 0
|
|
total = passed + failed
|
|
return (passed / total * 100.0) if total > 0 else 100.0
|
|
return None
|
|
|
|
|
|
def _extract_coverage(output: str) -> float | None:
|
|
"""Extract total coverage percentage from coverage output."""
|
|
coverage_m = re.search(r"(?:TOTAL\s+\d+\s+\d+\s+|Total coverage:\s*)(\d+)%", output)
|
|
if coverage_m:
|
|
try:
|
|
return float(coverage_m.group(1))
|
|
except ValueError:
|
|
pass
|
|
return None
|
|
|
|
|
|
# ── Higher-is-better metric names ────────────────────────────────────────────
# Metrics where a larger value is an improvement. Every other metric name
# (e.g. val_bpb) is treated as lower-is-better by SystemExperiment.
_HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"})
|
|
|
|
|
|
class SystemExperiment:
    """An autoresearch experiment targeting a specific module with a configurable metric.

    Encapsulates the hypothesis → edit → tox → evaluate → commit/revert loop
    for a single target file or module.

    Args:
        target: Path or module name to optimise (e.g. ``src/timmy/agent.py``).
        metric: Metric to extract from tox output. Built-in values:
            ``unit_pass_rate`` (default), ``coverage``, ``val_bpb``.
            Any other value is forwarded to :func:`_extract_metric`.
        budget_minutes: Wall-clock budget per experiment (default 5 min).
        workspace: Working directory for subprocess calls. Defaults to ``cwd``.
    """

    def __init__(
        self,
        target: str,
        metric: str = "unit_pass_rate",
        budget_minutes: int = 5,
        workspace: Path | None = None,
    ) -> None:
        self.target = target
        self.metric = metric
        self.budget_seconds = budget_minutes * 60
        self.workspace = Path(workspace) if workspace else Path.cwd()

    # ── Hypothesis generation ─────────────────────────────────────────────────

    def generate_hypothesis(self, program_content: str = "") -> str:
        """Return a plain-English hypothesis for the next experiment.

        Uses the first non-empty, non-comment line of *program_content* when
        available; falls back to a generic description based on target and
        metric.
        """
        first_line = ""
        for line in program_content.splitlines():
            stripped = line.strip()
            if stripped and not stripped.startswith("#"):
                first_line = stripped[:120]  # keep the hypothesis short
                break
        if first_line:
            return f"[{self.target}] {first_line}"
        return f"Improve {self.metric} for {self.target}"

    # ── Edit phase ────────────────────────────────────────────────────────────

    def apply_edit(self, hypothesis: str, model: str = "qwen3:30b") -> str:
        """Apply code edits to *target* via Aider.

        Returns a status string. Degrades gracefully — never raises.

        Args:
            hypothesis: Plain-English description of the change to make.
            model: Ollama model name passed to Aider.
        """
        prompt = f"Edit {self.target}: {hypothesis}"
        try:
            result = subprocess.run(
                ["aider", "--no-git", "--model", f"ollama/{model}", "--quiet", prompt],
                capture_output=True,
                text=True,
                timeout=self.budget_seconds,
                cwd=str(self.workspace),
            )
            if result.returncode == 0:
                return result.stdout or "Edit applied."
            return f"Aider error (exit {result.returncode}): {result.stderr[:500]}"
        except FileNotFoundError:
            logger.warning("Aider not installed — edit skipped")
            return "Aider not available — edit skipped"
        except subprocess.TimeoutExpired:
            logger.warning("Aider timed out after %ds", self.budget_seconds)
            return "Aider timed out"
        except (OSError, subprocess.SubprocessError) as exc:
            logger.warning("Aider failed: %s", exc)
            return f"Edit failed: {exc}"

    # ── Evaluation phase ──────────────────────────────────────────────────────

    def run_tox(self, tox_env: str = "unit") -> dict[str, Any]:
        """Run *tox_env* and return a result dict.

        Returns:
            Dict with keys: ``metric`` (float|None), ``log`` (str),
            ``duration_s`` (int), ``success`` (bool), ``error`` (str|None).
        """
        start = time.monotonic()
        try:
            result = subprocess.run(
                ["tox", "-e", tox_env],
                capture_output=True,
                text=True,
                timeout=self.budget_seconds,
                cwd=str(self.workspace),
            )
            duration = int(time.monotonic() - start)
            output = result.stdout + result.stderr
            metric_val = self._extract_tox_metric(output)
            return {
                "metric": metric_val,
                "log": output[-3000:],  # keep only the tail of the log
                "duration_s": duration,
                "success": result.returncode == 0,
                "error": None if result.returncode == 0 else f"Exit code {result.returncode}",
            }
        except subprocess.TimeoutExpired:
            duration = int(time.monotonic() - start)
            return {
                "metric": None,
                "log": f"Budget exceeded after {self.budget_seconds}s",
                "duration_s": duration,
                "success": False,
                "error": f"Budget exceeded after {self.budget_seconds}s",
            }
        except OSError as exc:  # e.g. tox executable missing
            return {"metric": None, "log": "", "duration_s": 0, "success": False, "error": str(exc)}

    def _extract_tox_metric(self, output: str) -> float | None:
        """Dispatch to the correct metric extractor based on *self.metric*."""
        if self.metric == "unit_pass_rate":
            return _extract_pass_rate(output)
        if self.metric == "coverage":
            return _extract_coverage(output)
        return _extract_metric(output, self.metric)

    def evaluate(self, current: float | None, baseline: float | None) -> str:
        """Compare *current* metric against *baseline* and return an assessment."""
        if current is None:
            return "Indeterminate: metric not extracted from output"
        if baseline is None:
            # First run: there is nothing to compare against yet.
            unit = "%" if self.metric in _HIGHER_IS_BETTER else ""
            return f"Baseline: {self.metric} = {current:.2f}{unit}"

        if self.metric in _HIGHER_IS_BETTER:
            delta = current - baseline
            pct = (delta / baseline * 100) if baseline != 0 else 0.0
            if delta > 0:
                return f"Improvement: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
            if delta < 0:
                return f"Regression: {self.metric} {baseline:.2f}% → {current:.2f}% ({pct:+.2f}%)"
            return f"No change: {self.metric} = {current:.2f}%"

        # lower-is-better (val_bpb, loss, etc.)
        return evaluate_result(current, baseline, self.metric)

    def is_improvement(self, current: float, baseline: float) -> bool:
        """Return True if *current* is better than *baseline* for this metric."""
        if self.metric in _HIGHER_IS_BETTER:
            return current > baseline
        return current < baseline  # lower-is-better

    # ── Git phase ─────────────────────────────────────────────────────────────

    def commit_changes(self, message: str) -> bool:
        """Stage and commit all changes. Returns True on success.

        Never raises: SubprocessError covers both CalledProcessError and
        TimeoutExpired; OSError covers a missing git executable. (Previously
        only CalledProcessError was caught, so a hung or absent git raised.)
        """
        try:
            subprocess.run(
                ["git", "add", "-A"], cwd=str(self.workspace), check=True, timeout=30
            )
            subprocess.run(
                ["git", "commit", "-m", message],
                cwd=str(self.workspace),
                check=True,
                timeout=30,
            )
            return True
        except (subprocess.SubprocessError, OSError) as exc:
            logger.warning("Git commit failed: %s", exc)
            return False

    def revert_changes(self) -> bool:
        """Revert all uncommitted changes. Returns True on success.

        Never raises — see :meth:`commit_changes` for the exception rationale.
        """
        try:
            subprocess.run(
                ["git", "checkout", "--", "."],
                cwd=str(self.workspace),
                check=True,
                timeout=30,
            )
            return True
        except (subprocess.SubprocessError, OSError) as exc:
            logger.warning("Git revert failed: %s", exc)
            return False
|