This repository has been archived on 2026-03-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Timmy-time-dashboard/tests/timmy/test_autoresearch.py
Alexander Whitestone 9d78eb31d1 ruff (#169)
* polish: streamline nav, extract inline styles, improve tablet UX

- Restructure desktop nav from 8+ flat links + overflow dropdown into
  5 grouped dropdowns (Core, Agents, Intel, System, More) matching
  the mobile menu structure to reduce decision fatigue
- Extract all inline styles from mission_control.html and base.html
  notification elements into mission-control.css with semantic classes
- Replace JS-built innerHTML with secure DOM construction in
  notification loader and chat history
- Add CONNECTING state to connection indicator (amber) instead of
  showing OFFLINE before WebSocket connects
- Add tablet breakpoint (1024px) with larger touch targets for
  Apple Pencil / stylus use and safe-area padding for iPad toolbar
- Add active-link highlighting in desktop dropdown menus
- Rename "Mission Control" page title to "System Overview" to
  disambiguate from the chat home page
- Add "Home — Timmy Time" page title to index.html

https://claude.ai/code/session_015uPUoKyYa8M2UAcyk5Gt6h

* fix(security): move auth-gate credentials to environment variables

Hardcoded username, password, and HMAC secret in auth-gate.py replaced
with os.environ lookups. Startup now refuses to run if any variable is
unset. Added AUTH_GATE_SECRET/USER/PASS to .env.example.

https://claude.ai/code/session_015uPUoKyYa8M2UAcyk5Gt6h

* refactor(tooling): migrate from black+isort+bandit to ruff

Replace three separate linting/formatting tools with a single ruff
invocation. Updates tox.ini (lint, format, pre-push, pre-commit envs),
.pre-commit-config.yaml, and CI workflow. Fixes all ruff errors
including unused imports, missing raise-from, and undefined names.
Ruff config maps existing bandit skips to equivalent S-rules.

https://claude.ai/code/session_015uPUoKyYa8M2UAcyk5Gt6h

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-03-11 12:23:35 -04:00

179 lines
5.8 KiB
Python

"""Tests for the autoresearch module — autonomous ML experiment loops."""
import json
from unittest.mock import MagicMock, patch
import pytest
class TestPrepareExperiment:
    """Behaviour of prepare_experiment(): cloning, skipping, and failure paths."""

    def test_clones_repo_when_not_present(self, tmp_path):
        """A missing workspace repo triggers a `git clone` as the first subprocess call."""
        from timmy.autoresearch import prepare_experiment

        with patch("timmy.autoresearch.subprocess.run") as run_mock:
            run_mock.return_value = MagicMock(returncode=0, stdout="", stderr="")
            prepare_experiment(tmp_path, "https://example.com/repo.git")
            assert run_mock.call_count >= 1
            # The very first invocation must be the clone itself.
            first_call = run_mock.call_args_list[0]
            assert "git" in first_call.args[0]
            assert "clone" in first_call.args[0]

    def test_skips_clone_when_present(self, tmp_path):
        """An existing repo directory must not be re-cloned."""
        from timmy.autoresearch import prepare_experiment

        (tmp_path / "autoresearch").mkdir()
        with patch("timmy.autoresearch.subprocess.run") as run_mock:
            run_mock.return_value = MagicMock(returncode=0, stdout="", stderr="")
            prepare_experiment(tmp_path)
            # Other subprocess calls are fine; a clone is not.
            if run_mock.called:
                assert "clone" not in str(run_mock.call_args_list[0])

    def test_clone_failure_returns_error(self, tmp_path):
        """A non-zero git exit status surfaces as a 'failed' message."""
        from timmy.autoresearch import prepare_experiment

        with patch("timmy.autoresearch.subprocess.run") as run_mock:
            run_mock.return_value = MagicMock(returncode=1, stdout="", stderr="auth failed")
            outcome = prepare_experiment(tmp_path)
            assert "failed" in outcome.lower()
class TestRunExperiment:
    """Behaviour of run_experiment(): success, timeout, and missing-script paths."""

    def test_successful_run_extracts_metric(self, tmp_path):
        """The last val_bpb value in stdout becomes the reported metric."""
        from timmy.autoresearch import run_experiment

        repo = tmp_path / "autoresearch"
        repo.mkdir()
        (repo / "train.py").write_text("print('training')")
        with patch("timmy.autoresearch.subprocess.run") as fake_run:
            fake_run.return_value = MagicMock(
                returncode=0,
                stdout="step 1000 val_bpb: 1.2345\nstep 2000 val_bpb: 1.1234",
                stderr="",
            )
            outcome = run_experiment(tmp_path, timeout=60)
            assert outcome["success"] is True
            # Latest reported value wins, not the first.
            assert outcome["metric"] == pytest.approx(1.1234)
            assert outcome["error"] is None

    def test_timeout_returns_error(self, tmp_path):
        """A subprocess timeout is converted into a failed result, not an exception."""
        import subprocess

        from timmy.autoresearch import run_experiment

        repo = tmp_path / "autoresearch"
        repo.mkdir()
        (repo / "train.py").write_text("print('training')")
        with patch("timmy.autoresearch.subprocess.run") as fake_run:
            fake_run.side_effect = subprocess.TimeoutExpired(cmd="python", timeout=5)
            outcome = run_experiment(tmp_path, timeout=5)
            assert outcome["success"] is False
            assert "timed out" in outcome["error"].lower()

    def test_missing_train_py(self, tmp_path):
        """Without a train.py the run fails fast with a 'not found' error."""
        from timmy.autoresearch import run_experiment

        (tmp_path / "autoresearch").mkdir()
        # Deliberately no train.py in the repo directory.
        outcome = run_experiment(tmp_path)
        assert outcome["success"] is False
        assert "not found" in outcome["error"].lower()

    def test_no_metric_in_output(self, tmp_path):
        """A clean run whose stdout has no metric still succeeds, metric stays None."""
        from timmy.autoresearch import run_experiment

        repo = tmp_path / "autoresearch"
        repo.mkdir()
        (repo / "train.py").write_text("print('done')")
        with patch("timmy.autoresearch.subprocess.run") as fake_run:
            fake_run.return_value = MagicMock(returncode=0, stdout="no metrics here", stderr="")
            outcome = run_experiment(tmp_path)
            assert outcome["success"] is True
            assert outcome["metric"] is None
class TestEvaluateResult:
    """Behaviour of evaluate_result(): lower metric vs. baseline is better."""

    def test_improvement_detected(self):
        """A metric below the baseline reads as an improvement."""
        from timmy.autoresearch import evaluate_result

        verdict = evaluate_result(1.10, 1.20)
        assert "improvement" in verdict.lower()

    def test_regression_detected(self):
        """A metric above the baseline reads as a regression."""
        from timmy.autoresearch import evaluate_result

        verdict = evaluate_result(1.30, 1.20)
        assert "regression" in verdict.lower()

    def test_no_change(self):
        """Equal metric and baseline read as no change."""
        from timmy.autoresearch import evaluate_result

        verdict = evaluate_result(1.20, 1.20)
        assert "no change" in verdict.lower()
class TestExperimentHistory:
    """Behaviour of get_experiment_history(): empty workspace and jsonl parsing."""

    def test_empty_workspace(self, tmp_path):
        """No results file means an empty history list."""
        from timmy.autoresearch import get_experiment_history

        assert get_experiment_history(tmp_path) == []

    def test_reads_jsonl(self, tmp_path):
        """Each jsonl line becomes one record, ordered most-recent-first."""
        from timmy.autoresearch import get_experiment_history

        records = [
            {"metric": 1.2, "success": True},
            {"metric": 1.1, "success": True},
        ]
        lines = "".join(json.dumps(rec) + "\n" for rec in records)
        (tmp_path / "results.jsonl").write_text(lines)

        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        # Newest entry (last line of the file) comes first.
        assert history[0]["metric"] == 1.1
class TestExtractMetric:
    """Behaviour of _extract_metric(): last-match wins, optional metric name."""

    def test_extracts_last_value(self):
        """With repeated val_bpb lines, the final value is returned."""
        from timmy.autoresearch import _extract_metric

        stdout_text = "val_bpb: 1.5\nval_bpb: 1.3\nval_bpb: 1.1"
        assert _extract_metric(stdout_text) == pytest.approx(1.1)

    def test_no_match_returns_none(self):
        """Output without the metric pattern yields None."""
        from timmy.autoresearch import _extract_metric

        assert _extract_metric("no metrics here") is None

    def test_custom_metric_name(self):
        """A caller-supplied metric name is honoured instead of the default."""
        from timmy.autoresearch import _extract_metric

        stdout_text = "loss: 0.45\nloss: 0.32"
        assert _extract_metric(stdout_text, "loss") == pytest.approx(0.32)