This repository has been archived on 2026-03-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Timmy-time-dashboard/tests/timmy/test_autoresearch_perplexity.py
Alexander Whitestone 9d78eb31d1 ruff (#169)
* polish: streamline nav, extract inline styles, improve tablet UX

- Restructure desktop nav from 8+ flat links + overflow dropdown into
  5 grouped dropdowns (Core, Agents, Intel, System, More) matching
  the mobile menu structure to reduce decision fatigue
- Extract all inline styles from mission_control.html and base.html
  notification elements into mission-control.css with semantic classes
- Replace JS-built innerHTML with secure DOM construction in
  notification loader and chat history
- Add CONNECTING state to connection indicator (amber) instead of
  showing OFFLINE before WebSocket connects
- Add tablet breakpoint (1024px) with larger touch targets for
  Apple Pencil / stylus use and safe-area padding for iPad toolbar
- Add active-link highlighting in desktop dropdown menus
- Rename "Mission Control" page title to "System Overview" to
  disambiguate from the chat home page
- Add "Home — Timmy Time" page title to index.html

https://claude.ai/code/session_015uPUoKyYa8M2UAcyk5Gt6h

* fix(security): move auth-gate credentials to environment variables

Hardcoded username, password, and HMAC secret in auth-gate.py replaced
with os.environ lookups. Startup now refuses to run if any variable is
unset. Added AUTH_GATE_SECRET/USER/PASS to .env.example.

https://claude.ai/code/session_015uPUoKyYa8M2UAcyk5Gt6h

* refactor(tooling): migrate from black+isort+bandit to ruff

Replace three separate linting/formatting tools with a single ruff
invocation. Updates tox.ini (lint, format, pre-push, pre-commit envs),
.pre-commit-config.yaml, and CI workflow. Fixes all ruff errors
including unused imports, missing raise-from, and undefined names.
Ruff config maps existing bandit skips to equivalent S-rules.

https://claude.ai/code/session_015uPUoKyYa8M2UAcyk5Gt6h

---------

Co-authored-by: Claude <noreply@anthropic.com>
2026-03-11 12:23:35 -04:00

244 lines
9.2 KiB
Python

"""Test plan for using the autoresearch module with perplexity as the target metric.
Perplexity is a standard LM evaluation metric (lower = better), so the existing
evaluate_result direction logic (lower-is-better) is correct without changes.
The tests below verify that every layer of the stack — metric extraction,
evaluation, run integration, config override, history persistence, tool
wiring, and dashboard rendering — works correctly when
``autoresearch_metric`` is set to ``perplexity``.
"""
from unittest.mock import MagicMock, patch
import pytest
# ── 1. Metric extraction ────────────────────────────────────────────────
class TestExtractPerplexity:
    """Exercise _extract_metric with 'perplexity' as the requested metric name."""

    def test_extracts_perplexity_value(self):
        """Two logged steps: the value asserted is the one from the later step."""
        from timmy.autoresearch import _extract_metric

        training_log = "step 500 perplexity: 42.31\nstep 1000 perplexity: 38.05"
        extracted = _extract_metric(training_log, "perplexity")
        assert extracted == pytest.approx(38.05)

    def test_extracts_last_occurrence(self):
        """With three readings in the log, the final reading is expected."""
        from timmy.autoresearch import _extract_metric

        training_log = "perplexity: 100.0\nperplexity: 80.5\nperplexity: 55.2\n"
        extracted = _extract_metric(training_log, "perplexity")
        assert extracted == pytest.approx(55.2)

    def test_handles_integer_perplexity(self):
        """A value written without a decimal point compares equal to 42.0."""
        from timmy.autoresearch import _extract_metric

        extracted = _extract_metric("perplexity: 42", "perplexity")
        assert extracted == pytest.approx(42.0)

    def test_handles_space_separator(self):
        """Some training scripts use 'perplexity 38.5' without a colon."""
        from timmy.autoresearch import _extract_metric

        extracted = _extract_metric("perplexity 38.5", "perplexity")
        assert extracted == pytest.approx(38.5)

    def test_returns_none_when_absent(self):
        """Output that only reports a loss yields None for perplexity."""
        from timmy.autoresearch import _extract_metric

        extracted = _extract_metric("loss: 0.45", "perplexity")
        assert extracted is None

    def test_ignores_unrelated_numbers(self):
        """Step, learning-rate and loss figures on an earlier line are not picked up."""
        from timmy.autoresearch import _extract_metric

        training_log = "step 500 lr 0.001 loss 2.3\nperplexity: 50.1"
        extracted = _extract_metric(training_log, "perplexity")
        assert extracted == pytest.approx(50.1)
# ── 2. Evaluation with perplexity ───────────────────────────────────────
class TestEvaluatePerplexity:
    """Check the wording evaluate_result produces when metric_name is 'perplexity'.

    NOTE(review): the calls below pass the new value first and the value it is
    compared against second (inferred from the "40 -> 30" case) — confirm
    against evaluate_result's signature.
    """

    def test_lower_is_improvement(self):
        """35.0 against 42.0 must be reported as an improvement and name the metric."""
        from timmy.autoresearch import evaluate_result

        verdict = evaluate_result(35.0, 42.0, metric_name="perplexity").lower()
        assert "improvement" in verdict
        assert "perplexity" in verdict

    def test_higher_is_regression(self):
        """50.0 against 42.0 must be reported as a regression."""
        from timmy.autoresearch import evaluate_result

        verdict = evaluate_result(50.0, 42.0, metric_name="perplexity").lower()
        assert "regression" in verdict

    def test_equal_is_no_change(self):
        """Identical values must be reported as 'no change'."""
        from timmy.autoresearch import evaluate_result

        verdict = evaluate_result(42.0, 42.0, metric_name="perplexity").lower()
        assert "no change" in verdict

    def test_percentage_is_correct(self):
        """The formatted percentage for 40 -> 30 appears verbatim in the message."""
        from timmy.autoresearch import evaluate_result

        # 40 -> 30 is a -25% change
        message = evaluate_result(30.0, 40.0, metric_name="perplexity")
        assert "-25.00%" in message
# ── 3. run_experiment with perplexity ───────────────────────────────────
class TestRunExperimentPerplexity:
    """Check run_experiment's result dict when called with metric_name='perplexity'."""

    @staticmethod
    def _run_with_output(workspace, script_body, stdout, stderr):
        """Call run_experiment against a stubbed subprocess and return its result.

        Creates ``workspace/autoresearch/train.py`` containing *script_body*,
        replaces ``timmy.autoresearch.subprocess.run`` with a mock whose return
        value reports exit code 0 together with the given *stdout* and
        *stderr*, then calls
        ``run_experiment(workspace, timeout=60, metric_name="perplexity")``.
        """
        from timmy.autoresearch import run_experiment

        checkout = workspace / "autoresearch"
        checkout.mkdir()
        (checkout / "train.py").write_text(script_body)

        completed = MagicMock(returncode=0, stdout=stdout, stderr=stderr)
        # Keyword arguments to patch() configure the replacement mock, so this
        # sets the same return_value the stubbed subprocess.run hands back.
        with patch("timmy.autoresearch.subprocess.run", return_value=completed):
            return run_experiment(workspace, timeout=60, metric_name="perplexity")

    def test_extracts_perplexity_from_stdout(self, tmp_path):
        """Perplexity lines on stdout: the later value is reported, with no error."""
        outcome = self._run_with_output(
            tmp_path,
            "print('training')",
            stdout="step 100 perplexity: 85.4\nstep 200 perplexity: 61.2",
            stderr="",
        )
        assert outcome["success"] is True
        assert outcome["metric"] == pytest.approx(61.2)
        assert outcome["error"] is None

    def test_returns_none_when_no_perplexity_logged(self, tmp_path):
        """A clean exit without any metric line still succeeds, with metric None."""
        outcome = self._run_with_output(
            tmp_path,
            "print('done')",
            stdout="training complete, no metrics",
            stderr="",
        )
        assert outcome["success"] is True
        assert outcome["metric"] is None

    def test_extracts_from_stderr_too(self, tmp_path):
        """Some frameworks log metrics to stderr."""
        outcome = self._run_with_output(
            tmp_path,
            "import sys",
            stdout="",
            stderr="perplexity: 72.3",
        )
        assert outcome["metric"] == pytest.approx(72.3)
# ── 4. Config override ─────────────────────────────────────────────────
class TestPerplexityConfig:
    """Settings plumbing: AUTORESEARCH_METRIC=perplexity reaches every call."""

    def test_config_accepts_perplexity(self):
        """Assigning 'perplexity' to settings.autoresearch_metric reads back unchanged."""
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        saved_metric = settings.autoresearch_metric
        try:
            settings.autoresearch_metric = "perplexity"
            assert settings.autoresearch_metric == "perplexity"
        finally:
            # Put the shared settings object back the way it was found.
            settings.autoresearch_metric = saved_metric

    def test_run_experiment_receives_custom_metric(self, tmp_path):
        """Verify run_experiment correctly uses 'perplexity' metric_name."""
        from timmy.autoresearch import run_experiment

        checkout = tmp_path / "autoresearch"
        checkout.mkdir()
        (checkout / "train.py").write_text("print('hi')")

        # The stubbed output reports two different metrics on separate lines.
        completed = MagicMock(
            returncode=0,
            stdout="perplexity: 44.2\nval_bpb: 1.05",
            stderr="",
        )
        with patch("timmy.autoresearch.subprocess.run", return_value=completed):
            outcome = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # Must pick perplexity, not val_bpb
        assert outcome["metric"] == pytest.approx(44.2)
# ── 5. History with perplexity values ───────────────────────────────────
class TestPerplexityHistory:
    """Records written with _append_result come back from get_experiment_history."""

    def test_round_trips_through_jsonl(self, tmp_path):
        """Two appended perplexity runs are returned, latest append first."""
        from timmy.autoresearch import _append_result, get_experiment_history

        recorded_runs = [
            {"metric": 85.4, "success": True, "metric_name": "perplexity"},
            {"metric": 61.2, "success": True, "metric_name": "perplexity"},
        ]
        for run in recorded_runs:
            _append_result(tmp_path, run)

        entries = get_experiment_history(tmp_path)
        assert len(entries) == 2
        assert entries[0]["metric"] == 61.2  # most recent first
        assert entries[1]["metric"] == 85.4

    def test_mixed_metric_history(self, tmp_path):
        """History may contain runs with different metrics (migration case)."""
        from timmy.autoresearch import _append_result, get_experiment_history

        recorded_runs = [
            {"metric": 1.12, "metric_name": "val_bpb"},
            {"metric": 55.0, "metric_name": "perplexity"},
        ]
        for run in recorded_runs:
            _append_result(tmp_path, run)

        entries = get_experiment_history(tmp_path)
        assert len(entries) == 2
        assert entries[0]["metric_name"] == "perplexity"
# ── 6. Dashboard route with perplexity ──────────────────────────────────
class TestExperimentsRoutePerplexity:
    """The /experiments dashboard renders correctly with perplexity metric."""

    def test_displays_perplexity_metric_name(self, client):
        """GET /experiments returns 200 with perplexity configured and no history.

        ``client`` is a fixture supplied outside this file (presumably an HTTP
        test client from the project's conftest — confirm).  The test is
        skipped when ``config.settings`` cannot be imported.
        """
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        # Snapshot every setting this test mutates. ``settings`` is a shared
        # module-level object, so anything left changed here would leak into
        # tests that run afterwards.
        original_metric = settings.autoresearch_metric
        original_enabled = settings.autoresearch_enabled
        try:
            settings.autoresearch_metric = "perplexity"
            settings.autoresearch_enabled = True
            # Stub the history lookup so the route renders with an empty list.
            with patch("timmy.autoresearch.get_experiment_history", return_value=[]):
                resp = client.get("/experiments")
            assert resp.status_code == 200
        finally:
            settings.autoresearch_metric = original_metric
            settings.autoresearch_enabled = original_enabled