This repository has been archived on 2026-03-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Timmy-time-dashboard/src/timmy/autoresearch.py
Alexander Whitestone 9d78eb31d1 ruff (#169)
* polish: streamline nav, extract inline styles, improve tablet UX

- Restructure desktop nav from 8+ flat links + overflow dropdown into
  5 grouped dropdowns (Core, Agents, Intel, System, More) matching
  the mobile menu structure to reduce decision fatigue
- Extract all inline styles from mission_control.html and base.html
  notification elements into mission-control.css with semantic classes
- Replace JS-built innerHTML with secure DOM construction in
  notification loader and chat history
- Add CONNECTING state to connection indicator (amber) instead of
  showing OFFLINE before WebSocket connects
- Add tablet breakpoint (1024px) with larger touch targets for
  Apple Pencil / stylus use and safe-area padding for iPad toolbar
- Add active-link highlighting in desktop dropdown menus
- Rename "Mission Control" page title to "System Overview" to
  disambiguate from the chat home page
- Add "Home — Timmy Time" page title to index.html

https://claude.ai/code/session_015uPUoKyYa8M2UAcyk5Gt6h

* fix(security): move auth-gate credentials to environment variables

Hardcoded username, password, and HMAC secret in auth-gate.py replaced
with os.environ lookups. Startup now refuses to run if any variable is
unset. Added AUTH_GATE_SECRET/USER/PASS to .env.example.

https://claude.ai/code/session_015uPUoKyYa8M2UAcyk5Gt6h

* refactor(tooling): migrate from black+isort+bandit to ruff

Replace three separate linting/formatting tools with a single ruff
invocation. Updates tox.ini (lint, format, pre-push, pre-commit envs),
.pre-commit-config.yaml, and CI workflow. Fixes all ruff errors
including unused imports, missing raise-from, and undefined names.
Ruff config maps existing bandit skips to equivalent S-rules.

https://claude.ai/code/session_015uPUoKyYa8M2UAcyk5Gt6h

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-03-11 12:23:35 -04:00

215 lines
6.5 KiB
Python

"""Autoresearch — autonomous ML experiment loops.
Integrates Karpathy's autoresearch pattern: an agent modifies training
code, runs time-boxed GPU experiments, evaluates a target metric
(val_bpb by default), and iterates to find improvements.
Flow:
1. prepare_experiment — clone repo + run data prep
2. run_experiment — execute train.py with wall-clock timeout
3. evaluate_result — compare metric against baseline
4. experiment_loop — orchestrate the full cycle
All subprocess calls are guarded with timeouts for graceful degradation.
"""
from __future__ import annotations

import json
import logging
import re
import subprocess
import sys
import time
from pathlib import Path
from typing import Any
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
# Default upstream repository cloned by prepare_experiment().
DEFAULT_REPO = "https://github.com/karpathy/autoresearch.git"
# Precompiled matcher for the default metric line, e.g. "val_bpb: 1.234".
# NOTE(review): _extract_metric compiles its own pattern per call, so this
# constant appears unused within this file — confirm before removing.
_METRIC_RE = re.compile(r"val_bpb[:\s]+([0-9]+\.?[0-9]*)")
def prepare_experiment(
    workspace: Path,
    repo_url: str = DEFAULT_REPO,
) -> str:
    """Clone the autoresearch repo and run its data-preparation step.

    Args:
        workspace: Directory to set up the experiment in (created if needed).
        repo_url: Git URL for the autoresearch repository.

    Returns:
        Status message describing what was prepared (or why it failed).
        Failures are reported as strings rather than raised, matching the
        module's graceful-degradation contract.
    """
    workspace = Path(workspace)
    workspace.mkdir(parents=True, exist_ok=True)
    repo_dir = workspace / "autoresearch"
    if not repo_dir.exists():
        logger.info("Cloning autoresearch into %s", repo_dir)
        try:
            result = subprocess.run(
                ["git", "clone", "--depth", "1", repo_url, str(repo_dir)],
                capture_output=True,
                text=True,
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            # Previously this propagated; now degrades gracefully like
            # run_experiment does.
            return "Clone failed: timed out after 120s"
        except OSError as exc:  # e.g. git binary not installed
            return f"Clone failed: {exc}"
        if result.returncode != 0:
            return f"Clone failed: {result.stderr.strip()}"
    else:
        logger.info("Autoresearch repo already present at %s", repo_dir)

    # Run prepare.py (data download + tokeniser training), if the repo has one.
    prepare_script = repo_dir / "prepare.py"
    if not prepare_script.exists():
        return "Preparation skipped — no prepare.py found."
    logger.info("Running prepare.py …")
    try:
        # sys.executable avoids relying on a bare "python" being on PATH.
        result = subprocess.run(
            [sys.executable, str(prepare_script)],
            capture_output=True,
            text=True,
            cwd=str(repo_dir),
            timeout=300,
        )
    except subprocess.TimeoutExpired:
        return "Preparation failed: timed out after 300s"
    except OSError as exc:
        return f"Preparation failed: {exc}"
    if result.returncode != 0:
        return f"Preparation failed: {result.stderr.strip()[:500]}"
    return "Preparation complete — data downloaded and tokeniser trained."
def run_experiment(
    workspace: Path,
    timeout: int = 300,
    metric_name: str = "val_bpb",
) -> dict[str, Any]:
    """Run a single training experiment with a wall-clock timeout.

    Args:
        workspace: Experiment workspace (contains autoresearch/ subdir).
        timeout: Maximum wall-clock seconds for the run.
        metric_name: Name of the metric to extract from stdout.

    Returns:
        Dict with keys: metric (float|None), log (str), duration_s (int),
        success (bool), error (str|None). Never raises — all failure modes
        are folded into the result dict.
    """
    repo_dir = Path(workspace) / "autoresearch"
    train_script = repo_dir / "train.py"
    if not train_script.exists():
        return {
            "metric": None,
            "log": "",
            "duration_s": 0,
            "success": False,
            "error": f"train.py not found in {repo_dir}",
        }
    start = time.monotonic()
    try:
        # sys.executable avoids relying on a bare "python" being on PATH
        # (consistent with prepare_experiment).
        result = subprocess.run(
            [sys.executable, str(train_script)],
            capture_output=True,
            text=True,
            cwd=str(repo_dir),
            timeout=timeout,
        )
        duration = int(time.monotonic() - start)
        # Metrics may be printed on either stream; scan both.
        output = result.stdout + result.stderr
        metric_val = _extract_metric(output, metric_name)
        return {
            "metric": metric_val,
            "log": output[-2000:],  # Keep last 2k chars
            "duration_s": duration,
            "success": result.returncode == 0,
            "error": None if result.returncode == 0 else f"Exit code {result.returncode}",
        }
    except subprocess.TimeoutExpired:
        duration = int(time.monotonic() - start)
        return {
            "metric": None,
            "log": f"Experiment timed out after {timeout}s",
            "duration_s": duration,
            "success": False,
            "error": f"Timed out after {timeout}s",
        }
    except OSError as exc:  # interpreter missing, permission errors, etc.
        return {
            "metric": None,
            "log": "",
            "duration_s": 0,
            "success": False,
            "error": str(exc),
        }
def _extract_metric(output: str, metric_name: str = "val_bpb") -> float | None:
"""Extract the last occurrence of a metric value from training output."""
pattern = re.compile(rf"{re.escape(metric_name)}[:\s]+([0-9]+\.?[0-9]*)")
matches = pattern.findall(output)
if matches:
try:
return float(matches[-1])
except ValueError:
pass
return None
def evaluate_result(
    current: float,
    baseline: float,
    metric_name: str = "val_bpb",
) -> str:
    """Compare a metric against baseline and return an assessment.

    For val_bpb, lower is better, so a negative delta is an improvement.

    Args:
        current: Current experiment's metric value.
        baseline: Baseline metric to compare against.
        metric_name: Name of the metric (for display).

    Returns:
        Human-readable assessment string.
    """
    delta = current - baseline
    # Guard against division by zero when the baseline is exactly 0.
    pct = 0.0 if baseline == 0 else (delta / baseline) * 100
    if delta == 0:
        return f"No change: {metric_name} = {current:.4f}"
    verdict = "Improvement" if delta < 0 else "Regression"
    return f"{verdict}: {metric_name} {baseline:.4f} -> {current:.4f} ({pct:+.2f}%)"
def get_experiment_history(workspace: Path) -> list[dict[str, Any]]:
    """Read experiment history from the workspace results file.

    Returns:
        List of experiment result dicts, most recent first. An absent file
        yields an empty list.
    """
    results_file = Path(workspace) / "results.jsonl"
    if not results_file.exists():
        return []
    entries: list[dict[str, Any]] = []
    for raw_line in results_file.read_text().strip().splitlines():
        try:
            parsed = json.loads(raw_line)
        except json.JSONDecodeError:
            # Skip corrupt lines rather than losing the whole history.
            continue
        entries.append(parsed)
    entries.reverse()
    return entries
def _append_result(workspace: Path, result: dict[str, Any]) -> None:
"""Append a result to the workspace JSONL log."""
results_file = Path(workspace) / "results.jsonl"
results_file.parent.mkdir(parents=True, exist_ok=True)
with results_file.open("a") as f:
f.write(json.dumps(result) + "\n")