"""Benchmark metrics collection and persistence.

Tracks per-scenario results: cycles used, wall-clock time, success, LLM
call count, and estimated metabolic cost.  Results are persisted as JSONL
for trend analysis and CI regression gates.
"""

from __future__ import annotations

import json
import logging
from dataclasses import asdict, dataclass, field
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass
class ScenarioResult:
    """Outcome of running a single benchmark scenario.

    Attributes:
        scenario_name: Human-readable scenario name.
        success: Whether the goal predicate was satisfied.
        cycles_used: Number of heartbeat cycles executed.
        max_cycles: The scenario's cycle budget.
        wall_time_ms: Total wall-clock time in milliseconds.
        llm_calls: Number of LLM inference calls made.
        metabolic_cost: Estimated resource cost (arbitrary unit, ≈ tokens).
        error: Error message if the run crashed.
        tags: Scenario tags (copied for filtering).
    """

    scenario_name: str
    success: bool = False
    cycles_used: int = 0
    max_cycles: int = 0
    wall_time_ms: int = 0
    llm_calls: int = 0
    metabolic_cost: float = 0.0
    error: str | None = None
    tags: list[str] = field(default_factory=list)


@dataclass
class BenchmarkMetrics:
    """Aggregated metrics across all scenarios in a benchmark run.

    Attributes:
        results: Per-scenario results.
        total_time_ms: Total wall-clock time for the full suite.
        timestamp: ISO-8601 timestamp of the run.
        commit_sha: Git commit SHA (if available).
    """

    results: list[ScenarioResult] = field(default_factory=list)
    total_time_ms: int = 0
    timestamp: str = ""
    commit_sha: str = ""

    # -- derived properties ------------------------------------------------

    @property
    def pass_count(self) -> int:
        """Number of scenarios that satisfied their goal predicate."""
        return sum(1 for r in self.results if r.success)

    @property
    def fail_count(self) -> int:
        """Number of scenarios that failed (including crashed runs)."""
        return sum(1 for r in self.results if not r.success)

    @property
    def success_rate(self) -> float:
        """Fraction of passing scenarios in [0.0, 1.0]; 0.0 for an empty run."""
        if not self.results:
            return 0.0
        return self.pass_count / len(self.results)

    @property
    def total_llm_calls(self) -> int:
        """Total LLM inference calls across all scenarios."""
        return sum(r.llm_calls for r in self.results)

    @property
    def total_metabolic_cost(self) -> float:
        """Total estimated resource cost across all scenarios."""
        return sum(r.metabolic_cost for r in self.results)

    # -- persistence -------------------------------------------------------

    def save(self, path: Path) -> None:
        """Append this run's results to a JSONL file at *path*.

        The parent directory is created if missing.  One JSON object is
        written per run, so the file accumulates a history of runs.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        record = {
            "timestamp": self.timestamp,
            "commit_sha": self.commit_sha,
            "total_time_ms": self.total_time_ms,
            "success_rate": round(self.success_rate, 4),
            "total_llm_calls": self.total_llm_calls,
            "total_metabolic_cost": round(self.total_metabolic_cost, 2),
            "scenarios": [asdict(r) for r in self.results],
        }
        # Explicit UTF-8 (not the platform default) so the history file is
        # portable; ensure_ascii=False keeps non-ASCII scenario names readable.
        with path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
        logger.info("Benchmark results saved to %s", path)

    # -- summary -----------------------------------------------------------

    def summary(self) -> str:
        """Return a human-readable summary of the benchmark run."""
        lines = [
            "=== Benchmark Summary ===",
            f"Scenarios: {len(self.results)}  "
            f"Passed: {self.pass_count}  "
            f"Failed: {self.fail_count}  "
            f"Success rate: {self.success_rate:.0%}",
            f"Total time: {self.total_time_ms} ms  "
            f"LLM calls: {self.total_llm_calls}  "
            f"Metabolic cost: {self.total_metabolic_cost:.1f}",
        ]
        if self.commit_sha:
            lines.append(f"Commit: {self.commit_sha}")
        lines.append("")
        for r in self.results:
            status = "PASS" if r.success else "FAIL"
            lines.append(
                f"  [{status}] {r.scenario_name} — "
                f"{r.cycles_used}/{r.max_cycles} cycles, "
                f"{r.wall_time_ms} ms, "
                f"{r.llm_calls} LLM calls"
            )
            if r.error:
                lines.append(f"      Error: {r.error}")
        return "\n".join(lines)


def load_history(path: Path) -> list[dict]:
    """Load benchmark history from a JSONL file.

    Malformed lines are skipped (best-effort: a partially written or
    hand-edited history should not abort trend analysis), but each skip
    is logged so corruption is visible.

    Returns:
        List of run records, most recent first.
    """
    path = Path(path)
    if not path.exists():
        return []
    records: list[dict] = []
    for lineno, line in enumerate(
        path.read_text(encoding="utf-8").strip().splitlines(), start=1
    ):
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            logger.warning("Skipping malformed JSONL at %s:%d", path, lineno)
            continue
    # File is append-only, so reversing yields most-recent-first.
    return list(reversed(records))


def compare_runs(
    current: BenchmarkMetrics,
    baseline: BenchmarkMetrics,
) -> str:
    """Compare two benchmark runs and report regressions.

    Reports overall success-rate and metabolic-cost deltas, then flags
    per-scenario changes: new/removed scenarios, pass/fail flips, and
    runs using >1.5x the baseline cycle count.

    Returns:
        Human-readable comparison report.
    """
    lines = ["=== Regression Report ==="]

    # Overall
    rate_delta = current.success_rate - baseline.success_rate
    lines.append(
        f"Success rate: {baseline.success_rate:.0%} -> {current.success_rate:.0%} "
        f"({rate_delta:+.0%})"
    )
    cost_delta = current.total_metabolic_cost - baseline.total_metabolic_cost
    if baseline.total_metabolic_cost > 0:
        cost_pct = (cost_delta / baseline.total_metabolic_cost) * 100
        lines.append(
            f"Metabolic cost: {baseline.total_metabolic_cost:.1f} -> "
            f"{current.total_metabolic_cost:.1f} ({cost_pct:+.1f}%)"
        )

    # Per-scenario
    baseline_map = {r.scenario_name: r for r in baseline.results}
    for r in current.results:
        b = baseline_map.get(r.scenario_name)
        if b is None:
            lines.append(f"  [NEW] {r.scenario_name}")
            continue
        if b.success and not r.success:
            lines.append(f"  [REGRESSION] {r.scenario_name} — was PASS, now FAIL")
        elif not b.success and r.success:
            lines.append(f"  [IMPROVEMENT] {r.scenario_name} — was FAIL, now PASS")
        elif r.cycles_used > b.cycles_used * 1.5:
            lines.append(
                f"  [SLOWER] {r.scenario_name} — "
                f"{b.cycles_used} -> {r.cycles_used} cycles (+{r.cycles_used - b.cycles_used})"
            )

    # Scenarios present in baseline but dropped from the current run —
    # a silently deleted scenario should be as visible as a new one.
    current_names = {r.scenario_name for r in current.results}
    for name in baseline_map:
        if name not in current_names:
            lines.append(f"  [REMOVED] {name}")

    return "\n".join(lines)