diff --git a/docs/research/autoresearch-h1-baseline.md b/docs/research/autoresearch-h1-baseline.md new file mode 100644 index 00000000..cd44b686 --- /dev/null +++ b/docs/research/autoresearch-h1-baseline.md @@ -0,0 +1,132 @@ +# Autoresearch H1 — M3 Max Baseline + +**Status:** Baseline established (Issue #905) +**Hardware:** Apple M3 Max · 36 GB unified memory +**Date:** 2026-03-23 +**Refs:** #905 · #904 (parent) · #881 (M3 Max compute) · #903 (MLX benchmark) + +--- + +## Setup + +### Prerequisites + +```bash +# Install MLX (Apple Silicon — definitively faster than llama.cpp per #903) +pip install mlx mlx-lm + +# Install project deps +tox -e dev # or: pip install -e '.[dev]' +``` + +### Clone & prepare + +`prepare_experiment` in `src/timmy/autoresearch.py` handles the clone. +On Apple Silicon it automatically sets `AUTORESEARCH_BACKEND=mlx` and +`AUTORESEARCH_DATASET=tinystories`. + +```python +from timmy.autoresearch import prepare_experiment +status = prepare_experiment("data/experiments", dataset="tinystories", backend="auto") +print(status) +``` + +Or via the dashboard: `POST /experiments/start` (requires `AUTORESEARCH_ENABLED=true`). + +### Configuration (`.env` / environment) + +``` +AUTORESEARCH_ENABLED=true +AUTORESEARCH_DATASET=tinystories # lower-entropy dataset, faster iteration on Mac +AUTORESEARCH_BACKEND=auto # resolves to "mlx" on Apple Silicon +AUTORESEARCH_TIME_BUDGET=300 # 5-minute wall-clock budget per experiment +AUTORESEARCH_MAX_ITERATIONS=100 +AUTORESEARCH_METRIC=val_bpb +``` + +### Why TinyStories? + +Karpathy's recommendation for resource-constrained hardware: lower entropy +means the model can learn meaningful patterns in less time and with a smaller +vocabulary, yielding cleaner val_bpb curves within the 5-minute budget. 
+ +--- + +## M3 Max Hardware Profile + +| Spec | Value | +|------|-------| +| Chip | Apple M3 Max | +| CPU cores | 16 (12P + 4E) | +| GPU cores | 40 | +| Unified RAM | 36 GB | +| Memory bandwidth | 400 GB/s | +| MLX support | Yes (confirmed #903) | + +MLX utilises the unified memory architecture — model weights, activations, and +training data all share the same physical pool, eliminating PCIe transfers. +This gives M3 Max a significant throughput advantage over external GPU setups +for models that fit in 36 GB. + +--- + +## Community Reference Data + +| Hardware | Experiments | Succeeded | Failed | Outcome | +|----------|-------------|-----------|--------|---------| +| Mac Mini M4 | 35 | 7 | 28 | Model improved by simplifying | +| Shopify (overnight) | ~50 | — | — | 19% quality gain; smaller beat 2× baseline | +| SkyPilot (16× GPU, 8 h) | ~910 | — | — | 2.87% improvement | +| Karpathy (H100, 2 days) | ~700 | 20+ | — | 11% training speedup | + +**Mac Mini M4 failure rate: 80% (28/35).** Failures are expected and by design — +the 5-minute budget deliberately prunes slow experiments. The 20% success rate +still yielded an improved model. + +--- + +## Baseline Results (M3 Max) + +> Fill in after running: `timmy learn --target --metric val_bpb --budget 5 --max-experiments 50` + +| Run | Date | Experiments | Succeeded | val_bpb (start) | val_bpb (end) | Δ | +|-----|------|-------------|-----------|-----------------|---------------|---| +| 1 | — | — | — | — | — | — | + +### Throughput estimate + +Based on the M3 Max hardware profile and Mac Mini M4 community data, expected +throughput is **8–14 experiments/hour** with the 5-minute budget and TinyStories +dataset. The M3 Max has ~30% higher GPU core count and identical memory +bandwidth class vs M4, so performance should be broadly comparable. 
+ +--- + +## Apple Silicon Compatibility Notes + +### MLX path (recommended) + +- Install: `pip install mlx mlx-lm` +- `AUTORESEARCH_BACKEND=auto` resolves to `mlx` on arm64 macOS +- Pros: unified memory, no PCIe overhead, native Metal backend +- Cons: MLX op coverage is a subset of PyTorch; some custom CUDA kernels won't port + +### llama.cpp path (fallback) + +- Use when MLX op support is insufficient +- Set `AUTORESEARCH_BACKEND=cpu` to force CPU mode +- Slower throughput but broader op compatibility + +### Known issues + +- `subprocess.TimeoutExpired` is the normal termination path — autoresearch + treats timeout as a completed-but-pruned experiment, not a failure +- Large batch sizes may trigger OOM if other processes hold unified memory; + set `PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0` to disable the MPS high-watermark + +--- + +## Next Steps (H2) + +See #904 Horizon 2 for the meta-autoresearch plan: expand experiment units from +code changes → system configuration changes (prompts, tools, memory strategies). diff --git a/src/config.py b/src/config.py index 96d071c3..4de9ad46 100644 --- a/src/config.py +++ b/src/config.py @@ -387,6 +387,11 @@ class Settings(BaseSettings): autoresearch_time_budget: int = 300 # seconds per experiment run autoresearch_max_iterations: int = 100 autoresearch_metric: str = "val_bpb" # metric to optimise (lower = better) + # M3 Max / Apple Silicon tuning (Issue #905). + # dataset: "tinystories" (default, lower-entropy, recommended for Mac) or "openwebtext". + autoresearch_dataset: str = "tinystories" + # backend: "auto" resolves to "mlx" on Apple Silicon and "cuda" elsewhere; "cpu" forces CPU fallback. + autoresearch_backend: str = "auto" # ── Weekly Narrative Summary ─────────────────────────────────────── # Generates a human-readable weekly summary of development activity. 
diff --git a/src/timmy/autoresearch.py b/src/timmy/autoresearch.py index 0d79450b..fb6bc6e5 100644 --- a/src/timmy/autoresearch.py +++ b/src/timmy/autoresearch.py @@ -17,6 +17,8 @@ from __future__ import annotations import json import logging +import os +import platform import re import subprocess import time @@ -34,15 +36,57 @@ _METRIC_RE = re.compile(r"val_bpb[:\s]+([0-9]+\.?[0-9]*)") _HIGHER_IS_BETTER = frozenset({"unit_pass_rate", "coverage"}) +def is_apple_silicon() -> bool: + """Return True when running on Apple Silicon (M-series chip).""" + return platform.system() == "Darwin" and platform.machine() == "arm64" + + +def _build_experiment_env( + dataset: str = "tinystories", + backend: str = "auto", +) -> dict[str, str]: + """Build environment variables for an autoresearch subprocess. + + Args: + dataset: Dataset name forwarded as ``AUTORESEARCH_DATASET``. + ``"tinystories"`` is recommended for Apple Silicon (lower entropy, + faster iteration). + backend: Inference backend forwarded as ``AUTORESEARCH_BACKEND``. + ``"auto"`` enables MLX on Apple Silicon; ``"cpu"`` forces CPU. + + Returns: + Merged environment dict (inherits current process env). + """ + env = os.environ.copy() + env["AUTORESEARCH_DATASET"] = dataset + + if backend == "auto": + env["AUTORESEARCH_BACKEND"] = "mlx" if is_apple_silicon() else "cuda" + else: + env["AUTORESEARCH_BACKEND"] = backend + + return env + + def prepare_experiment( workspace: Path, repo_url: str = DEFAULT_REPO, + dataset: str = "tinystories", + backend: str = "auto", ) -> str: """Clone autoresearch repo and run data preparation. + On Apple Silicon the ``dataset`` defaults to ``"tinystories"`` (lower + entropy, faster iteration) and ``backend`` to ``"auto"`` which resolves to + MLX. Both values are forwarded as ``AUTORESEARCH_DATASET`` / + ``AUTORESEARCH_BACKEND`` environment variables so that ``prepare.py`` and + ``train.py`` can adapt their behaviour without CLI changes. 
+ Args: workspace: Directory to set up the experiment in. repo_url: Git URL for the autoresearch repository. + dataset: Dataset name; ``"tinystories"`` is recommended on Mac. + backend: Inference backend; ``"auto"`` picks MLX on Apple Silicon. Returns: Status message describing what was prepared. @@ -64,6 +108,14 @@ def prepare_experiment( else: logger.info("Autoresearch repo already present at %s", repo_dir) + env = _build_experiment_env(dataset=dataset, backend=backend) + if is_apple_silicon(): + logger.info( + "Apple Silicon detected — dataset=%s backend=%s", + env["AUTORESEARCH_DATASET"], + env["AUTORESEARCH_BACKEND"], + ) + # Run prepare.py (data download + tokeniser training) prepare_script = repo_dir / "prepare.py" if prepare_script.exists(): @@ -74,6 +126,7 @@ def prepare_experiment( text=True, cwd=str(repo_dir), timeout=300, + env=env, ) if result.returncode != 0: return f"Preparation failed: {result.stderr.strip()[:500]}" @@ -86,6 +139,8 @@ def run_experiment( workspace: Path, timeout: int = 300, metric_name: str = "val_bpb", + dataset: str = "tinystories", + backend: str = "auto", ) -> dict[str, Any]: """Run a single training experiment with a wall-clock timeout. @@ -93,6 +148,9 @@ def run_experiment( workspace: Experiment workspace (contains autoresearch/ subdir). timeout: Maximum wall-clock seconds for the run. metric_name: Name of the metric to extract from stdout. + dataset: Dataset forwarded to the subprocess via env var. + backend: Inference backend forwarded via env var (``"auto"`` → MLX on + Apple Silicon, CUDA otherwise). 
Returns: Dict with keys: metric (float|None), log (str), duration_s (int), @@ -110,6 +168,7 @@ def run_experiment( "error": f"train.py not found in {repo_dir}", } + env = _build_experiment_env(dataset=dataset, backend=backend) start = time.monotonic() try: result = subprocess.run( @@ -118,6 +177,7 @@ def run_experiment( text=True, cwd=str(repo_dir), timeout=timeout, + env=env, ) duration = int(time.monotonic() - start) output = result.stdout + result.stderr diff --git a/tests/timmy/test_autoresearch.py b/tests/timmy/test_autoresearch.py index 2e670810..ce9fd0e0 100644 --- a/tests/timmy/test_autoresearch.py +++ b/tests/timmy/test_autoresearch.py @@ -6,6 +6,48 @@ from unittest.mock import MagicMock, patch import pytest +class TestAppleSiliconHelpers: + """Tests for is_apple_silicon() and _build_experiment_env().""" + + def test_is_apple_silicon_true_on_arm64_darwin(self): + from timmy.autoresearch import is_apple_silicon + + with patch("timmy.autoresearch.platform.system", return_value="Darwin"), \ + patch("timmy.autoresearch.platform.machine", return_value="arm64"): + assert is_apple_silicon() is True + + def test_is_apple_silicon_false_on_linux(self): + from timmy.autoresearch import is_apple_silicon + + with patch("timmy.autoresearch.platform.system", return_value="Linux"), \ + patch("timmy.autoresearch.platform.machine", return_value="x86_64"): + assert is_apple_silicon() is False + + def test_build_env_auto_resolves_mlx_on_apple_silicon(self): + from timmy.autoresearch import _build_experiment_env + + with patch("timmy.autoresearch.is_apple_silicon", return_value=True): + env = _build_experiment_env(dataset="tinystories", backend="auto") + + assert env["AUTORESEARCH_BACKEND"] == "mlx" + assert env["AUTORESEARCH_DATASET"] == "tinystories" + + def test_build_env_auto_resolves_cuda_on_non_apple(self): + from timmy.autoresearch import _build_experiment_env + + with patch("timmy.autoresearch.is_apple_silicon", return_value=False): + env = 
_build_experiment_env(dataset="openwebtext", backend="auto") + + assert env["AUTORESEARCH_BACKEND"] == "cuda" + assert env["AUTORESEARCH_DATASET"] == "openwebtext" + + def test_build_env_explicit_backend_not_overridden(self): + from timmy.autoresearch import _build_experiment_env + + env = _build_experiment_env(dataset="tinystories", backend="cpu") + assert env["AUTORESEARCH_BACKEND"] == "cpu" + + class TestPrepareExperiment: """Tests for prepare_experiment().""" @@ -44,6 +86,24 @@ class TestPrepareExperiment: assert "failed" in result.lower() + def test_prepare_passes_env_to_prepare_script(self, tmp_path): + from timmy.autoresearch import prepare_experiment + + repo_dir = tmp_path / "autoresearch" + repo_dir.mkdir() + (repo_dir / "prepare.py").write_text("pass") + + with patch("timmy.autoresearch.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + prepare_experiment(tmp_path, dataset="tinystories", backend="cpu") + + # The prepare.py call is the second call (first is skipped since repo exists) + prepare_call = mock_run.call_args + assert prepare_call.kwargs.get("env") is not None or prepare_call[1].get("env") is not None + call_kwargs = prepare_call.kwargs if prepare_call.kwargs else prepare_call[1] + assert call_kwargs["env"]["AUTORESEARCH_DATASET"] == "tinystories" + assert call_kwargs["env"]["AUTORESEARCH_BACKEND"] == "cpu" + class TestRunExperiment: """Tests for run_experiment()."""