Implement standardised Morrowind benchmark scenarios to detect agent
performance regressions after code changes.

- 5 built-in scenarios: navigation (Seyda Neen→Balmora, Balmora intra-city),
  quest (Fargoth's Ring), combat (Mudcrab), observation
- BenchmarkRunner executes scenarios through the heartbeat loop with
  MockWorldAdapter, tracking cycles, wall time, LLM calls, metabolic cost
- Goal predicates (reached_location, interacted_with) for early success
- BenchmarkMetrics with JSONL persistence and compare_runs() for regression
  detection
- CLI script (scripts/run_benchmarks.py) with tag filtering and baseline
  comparison
- tox -e benchmark environment for CI integration
- 31 unit tests covering scenarios, predicates, metrics, runner, and
  persistence

Fixes #1015

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""Tests for the agent performance regression benchmark suite.
|
|
|
|
Covers: scenario loading, metrics collection, runner execution,
|
|
goal predicates, and result persistence.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from infrastructure.world.benchmark.metrics import (
|
|
BenchmarkMetrics,
|
|
ScenarioResult,
|
|
compare_runs,
|
|
load_history,
|
|
)
|
|
from infrastructure.world.benchmark.runner import BenchmarkRunner
|
|
from infrastructure.world.benchmark.scenarios import (
|
|
BUILTIN_SCENARIOS,
|
|
BenchmarkScenario,
|
|
load_scenarios,
|
|
)
|
|
|
|
# ---------------------------------------------------------------------------
# Scenario definitions
# ---------------------------------------------------------------------------
class TestBenchmarkScenario:
    """Shape and loading behaviour of the built-in benchmark scenarios."""

    def test_builtin_scenarios_exist(self):
        # The suite ships with at least the five documented scenarios.
        assert len(BUILTIN_SCENARIOS) >= 5

    def test_scenario_fields(self):
        first = BUILTIN_SCENARIOS[0]
        # Every scenario carries the core descriptive fields.
        assert first.name
        assert first.description
        assert first.start_location
        assert first.max_cycles > 0

    def test_load_all_scenarios(self):
        # With no tag filter, load_scenarios returns every built-in scenario.
        assert len(load_scenarios()) == len(BUILTIN_SCENARIOS)

    def test_load_scenarios_by_tag(self):
        navigation = load_scenarios(tags=["navigation"])
        assert len(navigation) >= 2
        # Filtering must return only scenarios actually carrying the tag.
        assert all("navigation" in sc.tags for sc in navigation)

    def test_load_scenarios_no_match(self):
        # An unknown tag yields an empty list rather than raising.
        assert load_scenarios(tags=["nonexistent_tag"]) == []

    def test_scenario_is_frozen(self):
        scenario = BUILTIN_SCENARIOS[0]
        # Frozen dataclass: assignment raises (FrozenInstanceError is an
        # AttributeError subclass, so this catch also covers plain slots/props).
        with pytest.raises(AttributeError):
            scenario.name = "modified"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Goal predicates
# ---------------------------------------------------------------------------
class TestGoalPredicates:
    """Built-in goal predicates: location arrival and NPC interaction."""

    def test_reached_location_predicate(self):
        walk = BUILTIN_SCENARIOS[0]  # Walk to Balmora
        assert walk.goal_predicate is not None
        assert walk.goal_predicate([], "Balmora") is True
        assert walk.goal_predicate([], "Seyda Neen") is False

    def test_reached_location_case_insensitive(self):
        walk = BUILTIN_SCENARIOS[0]
        # Location matching must ignore case in either direction.
        for spelling in ("balmora", "BALMORA"):
            assert walk.goal_predicate([], spelling) is True

    def test_interacted_with_predicate(self):
        quest = BUILTIN_SCENARIOS[1]  # Fargoth quest
        assert quest.goal_predicate is not None
        history = [{"action": "speak", "target": "Fargoth"}]
        assert quest.goal_predicate(history, "Seyda Neen") is True

    def test_interacted_with_no_match(self):
        quest = BUILTIN_SCENARIOS[1]
        # Talking to someone else must not satisfy the goal.
        history = [{"action": "speak", "target": "Guard"}]
        assert quest.goal_predicate(history, "Seyda Neen") is False

    def test_interacted_with_interact_action(self):
        quest = BUILTIN_SCENARIOS[1]
        # "interact" satisfies the predicate just like "speak".
        history = [{"action": "interact", "target": "Fargoth"}]
        assert quest.goal_predicate(history, "Seyda Neen") is True

    def test_no_predicate_scenario(self):
        combat_scenarios = [sc for sc in BUILTIN_SCENARIOS if "combat" in sc.tags]
        # Combat scenarios run purely on the cycle budget: no goal predicate.
        assert combat_scenarios[0].goal_predicate is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
class TestScenarioResult:
    """Default field values of a freshly constructed ScenarioResult."""

    def test_default_values(self):
        result = ScenarioResult(scenario_name="test")
        # A new result starts pessimistic and empty: not successful, no work
        # recorded, no cost accrued, and no error captured.
        assert result.success is False
        assert result.cycles_used == 0
        assert result.llm_calls == 0
        assert result.metabolic_cost == 0.0
        assert result.error is None
|
|
|
|
|
|
class TestBenchmarkMetrics:
    """Aggregation, persistence, and summary rendering of BenchmarkMetrics."""

    def test_empty_metrics(self):
        empty = BenchmarkMetrics()
        # No results: all counters and rates are zero.
        assert empty.pass_count == 0
        assert empty.fail_count == 0
        assert empty.success_rate == 0.0
        assert empty.total_llm_calls == 0
        assert empty.total_metabolic_cost == 0.0

    def test_success_rate(self):
        outcomes = [("a", True), ("b", False), ("c", True)]
        metrics = BenchmarkMetrics(
            results=[
                ScenarioResult(scenario_name=name, success=ok)
                for name, ok in outcomes
            ]
        )
        assert metrics.pass_count == 2
        assert metrics.fail_count == 1
        # 2 of 3 passed; compare with a tolerance since the rate is a float.
        assert abs(metrics.success_rate - 2 / 3) < 0.01

    def test_totals(self):
        metrics = BenchmarkMetrics(
            results=[
                ScenarioResult(scenario_name="a", llm_calls=10, metabolic_cost=30.0),
                ScenarioResult(scenario_name="b", llm_calls=5, metabolic_cost=15.0),
            ]
        )
        assert metrics.total_llm_calls == 15
        assert metrics.total_metabolic_cost == 45.0

    def test_save_and_load(self, tmp_path):
        target = tmp_path / "bench.jsonl"
        record = ScenarioResult(
            scenario_name="a",
            success=True,
            cycles_used=5,
            max_cycles=10,
        )
        BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=1000,
            results=[record],
        ).save(target)

        history = load_history(target)
        # The round-trip preserves run metadata and per-scenario entries.
        assert len(history) == 1
        assert history[0]["commit_sha"] == "abc123"
        assert history[0]["scenarios"][0]["scenario_name"] == "a"

    def test_save_appends(self, tmp_path):
        target = tmp_path / "bench.jsonl"
        for day in (1, 2, 3):
            BenchmarkMetrics(
                timestamp=f"2026-01-0{day}T00:00:00",
                results=[ScenarioResult(scenario_name=f"s{day - 1}")],
            ).save(target)

        history = load_history(target)
        assert len(history) == 3
        # load_history orders entries newest-first.
        assert history[0]["timestamp"] == "2026-01-03T00:00:00"

    def test_summary_output(self):
        metrics = BenchmarkMetrics(
            timestamp="2026-01-01T00:00:00",
            commit_sha="abc123",
            total_time_ms=500,
            results=[
                ScenarioResult(
                    scenario_name="Walk Test",
                    success=True,
                    cycles_used=5,
                    max_cycles=10,
                    wall_time_ms=200,
                    llm_calls=15,
                ),
            ],
        )
        text = metrics.summary()
        # The rendered summary names the scenario, its verdict, and the SHA.
        for expected in ("Walk Test", "PASS", "abc123"):
            assert expected in text

    def test_load_history_missing_file(self, tmp_path):
        # A path that was never written yields an empty history, not an error.
        assert load_history(tmp_path / "nope.jsonl") == []

    def test_load_history_corrupt_lines(self, tmp_path):
        target = tmp_path / "bench.jsonl"
        target.write_text('{"valid": true}\nnot json\n{"also": "valid"}\n')
        # Invalid JSON lines are skipped rather than aborting the whole load.
        assert len(load_history(target)) == 2
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Comparison
# ---------------------------------------------------------------------------
class TestCompareRuns:
    """compare_runs flags regressions, improvements, slowdowns, new scenarios."""

    @staticmethod
    def _single_run(*, success, cycles):
        # Build a one-scenario metrics object for the "walk" scenario.
        return BenchmarkMetrics(
            results=[
                ScenarioResult(
                    scenario_name="walk", success=success, cycles_used=cycles
                )
            ]
        )

    def test_regression_detected(self):
        # Pass in baseline, fail now -> regression.
        report = compare_runs(
            self._single_run(success=False, cycles=10),
            self._single_run(success=True, cycles=10),
        )
        assert "REGRESSION" in report

    def test_improvement_detected(self):
        # Fail in baseline, pass now -> improvement.
        report = compare_runs(
            self._single_run(success=True, cycles=10),
            self._single_run(success=False, cycles=10),
        )
        assert "IMPROVEMENT" in report

    def test_slower_detected(self):
        # Still passing but taking twice the cycles -> slower.
        report = compare_runs(
            self._single_run(success=True, cycles=20),
            self._single_run(success=True, cycles=10),
        )
        assert "SLOWER" in report

    def test_new_scenario_noted(self):
        baseline = BenchmarkMetrics(results=[])
        current = BenchmarkMetrics(
            results=[ScenarioResult(scenario_name="new_one", success=True)]
        )
        # A scenario absent from the baseline is reported as new.
        assert "NEW" in compare_runs(current, baseline)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
class TestBenchmarkRunner:
    """End-to-end execution of scenarios through BenchmarkRunner."""

    @pytest.mark.asyncio
    async def test_run_single_scenario(self):
        """Runner executes a scenario and returns a result."""
        scenario = BenchmarkScenario(
            name="Test Walk",
            description="Simple test",
            start_location="A",
            goal_location="A",
            max_cycles=3,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([scenario])
        assert len(metrics.results) == 1
        outcome = metrics.results[0]
        assert outcome.scenario_name == "Test Walk"
        assert outcome.cycles_used == 3  # no predicate: runs all cycles
        assert outcome.success is True  # no predicate: surviving counts as pass
        assert outcome.wall_time_ms >= 0
        assert outcome.llm_calls == 9  # 3 cycles * 3 calls per cycle
        assert outcome.metabolic_cost > 0

    @pytest.mark.asyncio
    async def test_run_with_goal_predicate(self):
        """Runner stops early when goal predicate is satisfied."""
        scenario = BenchmarkScenario(
            name="Instant Win",
            description="Predicate satisfied immediately",
            start_location="A",
            max_cycles=100,
            goal_predicate=lambda actions, location: True,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([scenario])
        outcome = metrics.results[0]
        assert outcome.success is True
        assert outcome.cycles_used == 1  # satisfied on the very first cycle

    @pytest.mark.asyncio
    async def test_run_with_failing_predicate(self):
        """Scenario fails when predicate never satisfied."""
        scenario = BenchmarkScenario(
            name="Impossible",
            description="Predicate never satisfied",
            start_location="A",
            max_cycles=5,
            goal_predicate=lambda actions, location: False,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([scenario])
        outcome = metrics.results[0]
        assert outcome.success is False
        assert outcome.cycles_used == 5

    @pytest.mark.asyncio
    async def test_run_multiple_scenarios(self):
        """Runner handles multiple scenarios in sequence."""
        batch = [
            BenchmarkScenario(
                name=f"Scenario {idx}",
                description=f"Test {idx}",
                start_location="A",
                max_cycles=2,
                tags=["test"],
            )
            for idx in range(3)
        ]
        metrics = await BenchmarkRunner().run(batch)
        assert len(metrics.results) == 3
        assert metrics.total_time_ms >= 0
        assert metrics.timestamp

    @pytest.mark.asyncio
    async def test_metrics_commit_sha(self):
        """Runner captures git SHA in metrics."""
        scenario = BenchmarkScenario(
            name="SHA Test",
            description="Check SHA capture",
            start_location="A",
            max_cycles=1,
            tags=["test"],
        )
        metrics = await BenchmarkRunner().run([scenario])
        # SHA may or may not be available in the test environment; only
        # assert the field is a string so the capture path doesn't crash.
        assert isinstance(metrics.commit_sha, str)

    @pytest.mark.asyncio
    async def test_builtin_scenarios_run(self):
        """All built-in scenarios run without crashing."""
        # Clone each built-in with a 2-cycle budget and no predicate so the
        # smoke test stays fast and deterministic.
        fast_clones = [
            BenchmarkScenario(
                name=original.name,
                description=original.description,
                start_location=original.start_location,
                goal_location=original.goal_location,
                entities=list(original.entities),
                events=list(original.events),
                max_cycles=2,
                goal_predicate=None,
                tags=list(original.tags),
            )
            for original in BUILTIN_SCENARIOS
        ]
        metrics = await BenchmarkRunner().run(fast_clones)
        assert len(metrics.results) == len(BUILTIN_SCENARIOS)
        # With no predicate, surviving to the cycle cap counts as success.
        for outcome in metrics.results:
            assert outcome.success is True
            assert outcome.error is None
|