Timmy-time-dashboard/src/infrastructure/world/benchmark/metrics.py

"""Benchmark metrics collection and persistence.

Tracks per-scenario results: cycles used, wall-clock time, success,
LLM call count, and estimated metabolic cost.  Results are persisted
as JSONL for trend analysis and CI regression gates.
"""

from __future__ import annotations

import json
import logging
from dataclasses import asdict, dataclass, field
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass
class ScenarioResult:
    """Outcome of running a single benchmark scenario.

    Attributes:
        scenario_name:  Human-readable scenario name.
        success:        Whether the goal predicate was satisfied.
        cycles_used:    Number of heartbeat cycles executed.
        max_cycles:     The scenario's cycle budget.
        wall_time_ms:   Total wall-clock time in milliseconds.
        llm_calls:      Number of LLM inference calls made.
        metabolic_cost: Estimated resource cost (arbitrary unit, ≈ tokens).
        error:          Error message if the run crashed.
        tags:           Scenario tags (copied for filtering).
    """

    scenario_name: str
    success: bool = False
    cycles_used: int = 0
    max_cycles: int = 0
    wall_time_ms: int = 0
    llm_calls: int = 0
    metabolic_cost: float = 0.0
    error: str | None = None
    tags: list[str] = field(default_factory=list)


@dataclass
class BenchmarkMetrics:
    """Aggregated metrics across all scenarios in a benchmark run.

    Attributes:
        results:       Per-scenario results.
        total_time_ms: Total wall-clock time for the full suite.
        timestamp:     ISO-8601 timestamp of the run.
        commit_sha:    Git commit SHA (if available).
    """

    results: list[ScenarioResult] = field(default_factory=list)
    total_time_ms: int = 0
    timestamp: str = ""
    commit_sha: str = ""

    # -- derived properties ------------------------------------------------

    @property
    def pass_count(self) -> int:
        return sum(1 for r in self.results if r.success)

    @property
    def fail_count(self) -> int:
        return sum(1 for r in self.results if not r.success)

    @property
    def success_rate(self) -> float:
        if not self.results:
            return 0.0
        return self.pass_count / len(self.results)

    @property
    def total_llm_calls(self) -> int:
        return sum(r.llm_calls for r in self.results)

    @property
    def total_metabolic_cost(self) -> float:
        return sum(r.metabolic_cost for r in self.results)

    # -- persistence -------------------------------------------------------

    def save(self, path: Path) -> None:
        """Append this run's results to a JSONL file at *path*."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        record = {
            "timestamp": self.timestamp,
            "commit_sha": self.commit_sha,
            "total_time_ms": self.total_time_ms,
            "success_rate": round(self.success_rate, 4),
            "total_llm_calls": self.total_llm_calls,
            "total_metabolic_cost": round(self.total_metabolic_cost, 2),
            "scenarios": [asdict(r) for r in self.results],
        }
        with path.open("a") as f:
            f.write(json.dumps(record) + "\n")
        logger.info("Benchmark results saved to %s", path)

    # -- summary -----------------------------------------------------------

    def summary(self) -> str:
        """Return a human-readable summary of the benchmark run."""
        lines = [
            "=== Benchmark Summary ===",
            f"Scenarios: {len(self.results)}  "
            f"Passed: {self.pass_count}  "
            f"Failed: {self.fail_count}  "
            f"Success rate: {self.success_rate:.0%}",
            f"Total time: {self.total_time_ms} ms  "
            f"LLM calls: {self.total_llm_calls}  "
            f"Metabolic cost: {self.total_metabolic_cost:.1f}",
        ]
        if self.commit_sha:
            lines.append(f"Commit: {self.commit_sha}")
        lines.append("")
        for r in self.results:
            status = "PASS" if r.success else "FAIL"
            lines.append(
                f"  [{status}] {r.scenario_name} — "
                f"{r.cycles_used}/{r.max_cycles} cycles, "
                f"{r.wall_time_ms} ms, "
                f"{r.llm_calls} LLM calls"
            )
            if r.error:
                lines.append(f"         Error: {r.error}")
        return "\n".join(lines)


def load_history(path: Path) -> list[dict]:
    """Load benchmark history from a JSONL file.

    Returns:
        List of run records, most recent first.
    """
    path = Path(path)
    if not path.exists():
        return []
    records: list[dict] = []
    for line in path.read_text().strip().splitlines():
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            continue
    return list(reversed(records))


def compare_runs(
    current: BenchmarkMetrics,
    baseline: BenchmarkMetrics,
) -> str:
    """Compare two benchmark runs and report regressions.

    Returns:
        Human-readable comparison report.
    """
    lines = ["=== Regression Report ==="]

    # Overall
    rate_delta = current.success_rate - baseline.success_rate
    lines.append(
        f"Success rate: {baseline.success_rate:.0%} -> {current.success_rate:.0%} "
        f"({rate_delta:+.0%})"
    )

    cost_delta = current.total_metabolic_cost - baseline.total_metabolic_cost
    if baseline.total_metabolic_cost > 0:
        cost_pct = (cost_delta / baseline.total_metabolic_cost) * 100
        lines.append(
            f"Metabolic cost: {baseline.total_metabolic_cost:.1f} -> "
            f"{current.total_metabolic_cost:.1f} ({cost_pct:+.1f}%)"
        )

    # Per-scenario
    baseline_map = {r.scenario_name: r for r in baseline.results}
    for r in current.results:
        b = baseline_map.get(r.scenario_name)
        if b is None:
            lines.append(f"  [NEW] {r.scenario_name}")
            continue
        if b.success and not r.success:
            lines.append(f"  [REGRESSION] {r.scenario_name} — was PASS, now FAIL")
        elif not b.success and r.success:
            lines.append(f"  [IMPROVEMENT] {r.scenario_name} — was FAIL, now PASS")
        elif r.cycles_used > b.cycles_used * 1.5:
            lines.append(
                f"  [SLOWER] {r.scenario_name} — "
                f"{b.cycles_used} -> {r.cycles_used} cycles (+{r.cycles_used - b.cycles_used})"
            )

    return "\n".join(lines)