"""Tests for using the autoresearch module with perplexity as the target metric.

Covers run integration, config override, history, and dashboard rendering when
`autoresearch_metric` is set to ``perplexity``.

Note: metric extraction and evaluation logic are already tested in
test_autoresearch.py — only perplexity-specific integration paths are tested here.
"""
|
|
|
|
|
|
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
# ── run_experiment with perplexity ──────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestRunExperimentPerplexity:
    """run_experiment must extract perplexity when metric_name='perplexity'."""

    @staticmethod
    def _fake_repo(root, script_body):
        # Lay down the minimal repo layout run_experiment expects under *root*.
        repo = root / "autoresearch"
        repo.mkdir()
        (repo / "train.py").write_text(script_body)

    @staticmethod
    def _fake_proc(out, err):
        # Stand-in for a successful subprocess.CompletedProcess.
        return MagicMock(returncode=0, stdout=out, stderr=err)

    def test_extracts_perplexity_from_stdout(self, tmp_path):
        from timmy.autoresearch import run_experiment

        self._fake_repo(tmp_path, "print('training')")

        with patch("timmy.autoresearch.subprocess.run") as run_proc:
            run_proc.return_value = self._fake_proc(
                "step 100 perplexity: 85.4\nstep 200 perplexity: 61.2", ""
            )
            outcome = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # The most recent logged value wins, and the run is reported clean.
        assert outcome["success"] is True
        assert outcome["metric"] == pytest.approx(61.2)
        assert outcome["error"] is None

    def test_returns_none_when_no_perplexity_logged(self, tmp_path):
        from timmy.autoresearch import run_experiment

        self._fake_repo(tmp_path, "print('done')")

        with patch("timmy.autoresearch.subprocess.run") as run_proc:
            run_proc.return_value = self._fake_proc("training complete, no metrics", "")
            outcome = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # A clean run that never logs the metric succeeds with no value.
        assert outcome["success"] is True
        assert outcome["metric"] is None

    def test_extracts_from_stderr_too(self, tmp_path):
        """Some frameworks log metrics to stderr."""
        from timmy.autoresearch import run_experiment

        self._fake_repo(tmp_path, "import sys")

        with patch("timmy.autoresearch.subprocess.run") as run_proc:
            run_proc.return_value = self._fake_proc("", "perplexity: 72.3")
            outcome = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        assert outcome["metric"] == pytest.approx(72.3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Config override ────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestPerplexityConfig:
    """Settings plumbing: AUTORESEARCH_METRIC=perplexity reaches every call."""

    def test_config_accepts_perplexity(self):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        saved = settings.autoresearch_metric
        try:
            # The setting must round-trip the new value unchanged.
            settings.autoresearch_metric = "perplexity"
            assert settings.autoresearch_metric == "perplexity"
        finally:
            settings.autoresearch_metric = saved

    def test_run_experiment_receives_custom_metric(self, tmp_path):
        """Verify run_experiment correctly uses 'perplexity' metric_name."""
        from timmy.autoresearch import run_experiment

        repo = tmp_path / "autoresearch"
        repo.mkdir()
        (repo / "train.py").write_text("print('hi')")

        with patch("timmy.autoresearch.subprocess.run") as run_proc:
            run_proc.return_value = MagicMock(
                returncode=0,
                stdout="perplexity: 44.2\nval_bpb: 1.05",
                stderr="",
            )
            outcome = run_experiment(tmp_path, timeout=60, metric_name="perplexity")

        # Must pick perplexity, not val_bpb
        assert outcome["metric"] == pytest.approx(44.2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── History with perplexity values ─────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestPerplexityHistory:
    """Experiment history stores and retrieves perplexity values correctly."""

    def test_round_trips_through_jsonl(self, tmp_path):
        from timmy.autoresearch import _append_result, get_experiment_history

        for value in (85.4, 61.2):
            _append_result(
                tmp_path,
                {"metric": value, "success": True, "metric_name": "perplexity"},
            )

        records = get_experiment_history(tmp_path)
        assert len(records) == 2
        assert records[0]["metric"] == 61.2  # most recent first
        assert records[1]["metric"] == 85.4

    def test_mixed_metric_history(self, tmp_path):
        """History may contain runs with different metrics (migration case)."""
        from timmy.autoresearch import _append_result, get_experiment_history

        _append_result(tmp_path, {"metric": 1.12, "metric_name": "val_bpb"})
        _append_result(tmp_path, {"metric": 55.0, "metric_name": "perplexity"})

        records = get_experiment_history(tmp_path)
        assert len(records) == 2
        # Newest entry (perplexity) surfaces first even among mixed metrics.
        assert records[0]["metric_name"] == "perplexity"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Dashboard route with perplexity ────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestExperimentsRoutePerplexity:
    """The /experiments dashboard renders correctly with perplexity metric."""

    def test_displays_perplexity_metric_name(self, client):
        try:
            from config import settings
        except ImportError:
            pytest.skip("pydantic_settings not installed")

        # Snapshot both settings so the test leaves global state untouched.
        saved_metric = settings.autoresearch_metric
        saved_enabled = settings.autoresearch_enabled
        try:
            settings.autoresearch_metric = "perplexity"
            settings.autoresearch_enabled = True

            # Empty history keeps the route focused on rendering, not data.
            with patch("timmy.autoresearch.get_experiment_history", return_value=[]):
                resp = client.get("/experiments")

            assert resp.status_code == 200
        finally:
            settings.autoresearch_metric = saved_metric
            settings.autoresearch_enabled = saved_enabled
|