Timmy-time-dashboard/tests/timmy/test_autoresearch_perplexity.py
Alexander Whitestone 36fc10097f
Claude/angry cerf (#173)
* feat: set qwen3.5:latest as default model

- Make qwen3.5:latest the primary default model for faster inference
- Move llama3.1:8b-instruct to fallback chain
- Update text fallback chain to prioritize qwen3.5:latest

Retains full backward compatibility via cascade fallback.
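
For reference, a cascade like this is just ordered iteration with error capture. A minimal sketch, assuming the chain is a plain ordered list and `generate` is any per-model completion callable (both are illustrative stand-ins, not the repo's actual API):

    from typing import Callable

    # Order matches this commit: qwen3.5 first, llama3.1 as fallback.
    TEXT_FALLBACK_CHAIN = ["qwen3.5:latest", "llama3.1:8b-instruct"]

    def generate_with_fallback(prompt: str, generate: Callable[[str, str], str]) -> str:
        """Try each model in order; return the first successful completion."""
        last_error: Exception | None = None
        for model in TEXT_FALLBACK_CHAIN:
            try:
                return generate(model, prompt)
            except Exception as exc:  # any per-model failure falls through to the next
                last_error = exc
        raise RuntimeError(f"all models in the fallback chain failed: {last_error}")

Because failures cascade rather than abort, callers relying on llama3.1:8b-instruct still get a completion when qwen3.5 is unavailable.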

* test: remove ~55 brittle, duplicate, and useless tests

An audit of all 100 test files identified tests that provided no real
regression protection. Removed:

- 4 files deleted entirely: test_setup_script (always skipped),
  test_csrf_bypass (tautological assertions), test_input_validation
  (accepts 200-500 status codes), test_security_regression (fragile
  source-pattern checks redundant with rendering tests)
- Duplicate test classes (TestToolTracking, TestCalculatorExtended)
- Mock-only tests that just verify mock wiring, not behavior
- Structurally broken tests (TestCreateToolFunctions patches after import)
- Empty/pass-body tests and meaningless assertions (len > 20)
- Flaky subprocess tests (aider tool calling real binary)

All 1328 remaining tests pass. Net: -699 lines, zero coverage loss.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: prevent test pollution from autoresearch_enabled mutation

test_autoresearch_perplexity.py was setting settings.autoresearch_enabled = True
but never restoring it in the finally block — polluting subsequent tests.
When pytest-randomly ordered it before test_experiments_page_shows_disabled_when_off,
the victim test saw enabled=True and failed to find "Disabled" in the page.

Fix both sides:
- Restore autoresearch_enabled in the finally block (root cause)
- Mock settings explicitly in the victim test (defense in depth)
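
As a minimal sketch of the defense-in-depth side (the victim test lives in another file, so this body is illustrative; it assumes the shared config.settings object and a client whose response body is exposed as resp.text, httpx-style):

    def test_experiments_page_shows_disabled_when_off(client, monkeypatch):
        from config import settings

        # Pin the setting so the outcome no longer depends on test order;
        # monkeypatch undoes the change automatically at teardown.
        monkeypatch.setattr(settings, "autoresearch_enabled", False)
        resp = client.get("/experiments")
        assert resp.status_code == 200
        assert "Disabled" in resp.text

Unlike a manual finally block, monkeypatch's restore cannot be forgotten, which is why it suits the defensive side of the fix.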

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Trip T <trip@local>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 16:55:27 -04:00


"""Tests for using the autoresearch module with perplexity as the target metric.
Covers run integration, config override, history, and dashboard rendering when
``autoresearch_metric`` is set to ``perplexity``.
Note: metric extraction and evaluation logic are already tested in
test_autoresearch.py — only perplexity-specific integration paths are tested here.
"""
from unittest.mock import MagicMock, patch

import pytest


# ── run_experiment with perplexity ──────────────────────────────────────


class TestRunExperimentPerplexity:
    """run_experiment must extract perplexity when metric_name='perplexity'."""

    def test_extracts_perplexity_from_stdout(self, tmp_path):
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('training')")
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="step 100 perplexity: 85.4\nstep 200 perplexity: 61.2",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")
            assert result["success"] is True
            assert result["metric"] == pytest.approx(61.2)
            assert result["error"] is None

    def test_returns_none_when_no_perplexity_logged(self, tmp_path):
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('done')")
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="training complete, no metrics",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")
            assert result["success"] is True
            assert result["metric"] is None

    def test_extracts_from_stderr_too(self, tmp_path):
        """Some frameworks log metrics to stderr."""
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("import sys")
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="",
                stderr="perplexity: 72.3",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")
            assert result["metric"] == pytest.approx(72.3)


# ── Config override ──────────────────────────────────────────────────────


class TestPerplexityConfig:
    """Settings plumbing: AUTORESEARCH_METRIC=perplexity reaches every call."""

    def test_config_accepts_perplexity(self):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        original = settings.autoresearch_metric
        try:
            settings.autoresearch_metric = "perplexity"
            assert settings.autoresearch_metric == "perplexity"
        finally:
            settings.autoresearch_metric = original

    def test_run_experiment_receives_custom_metric(self, tmp_path):
        """Verify run_experiment correctly uses 'perplexity' metric_name."""
        from timmy.autoresearch import run_experiment

        repo_dir = tmp_path / "autoresearch"
        repo_dir.mkdir()
        (repo_dir / "train.py").write_text("print('hi')")
        with patch("timmy.autoresearch.subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0,
                stdout="perplexity: 44.2\nval_bpb: 1.05",
                stderr="",
            )
            result = run_experiment(tmp_path, timeout=60, metric_name="perplexity")
            # Must pick perplexity, not val_bpb
            assert result["metric"] == pytest.approx(44.2)


# ── History with perplexity values ───────────────────────────────────────


class TestPerplexityHistory:
    """Experiment history stores and retrieves perplexity values correctly."""

    def test_round_trips_through_jsonl(self, tmp_path):
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 85.4, "success": True, "metric_name": "perplexity"})
        _append_result(tmp_path, {"metric": 61.2, "success": True, "metric_name": "perplexity"})
        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        assert history[0]["metric"] == 61.2  # most recent first
        assert history[1]["metric"] == 85.4

    def test_mixed_metric_history(self, tmp_path):
        """History may contain runs with different metrics (migration case)."""
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 1.12, "metric_name": "val_bpb"})
        _append_result(tmp_path, {"metric": 55.0, "metric_name": "perplexity"})
        history = get_experiment_history(tmp_path)
        assert len(history) == 2
        assert history[0]["metric_name"] == "perplexity"


# ── Dashboard route with perplexity ──────────────────────────────────────


class TestExperimentsRoutePerplexity:
    """The /experiments dashboard renders correctly with perplexity metric."""

    def test_displays_perplexity_metric_name(self, client):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        original_metric = settings.autoresearch_metric
        original_enabled = settings.autoresearch_enabled
        try:
            settings.autoresearch_metric = "perplexity"
            settings.autoresearch_enabled = True
            with patch("timmy.autoresearch.get_experiment_history", return_value=[]):
                resp = client.get("/experiments")
                assert resp.status_code == 200
        finally:
            settings.autoresearch_metric = original_metric
            settings.autoresearch_enabled = original_enabled
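

# The manual snapshot/restore above can also be factored into a fixture so
# the try/finally cannot be forgotten. A minimal sketch (hypothetical fixture
# name, assuming the same config.settings object used throughout this file):
@pytest.fixture
def perplexity_settings():
    """Flip settings for one test and guarantee restoration afterwards."""
    from config import settings

    saved = (settings.autoresearch_metric, settings.autoresearch_enabled)
    settings.autoresearch_metric = "perplexity"
    settings.autoresearch_enabled = True
    try:
        yield settings
    finally:
        # pytest runs this even when the test body raises, so no test can
        # leak mutated settings into the rest of the suite.
        settings.autoresearch_metric, settings.autoresearch_enabled = saved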