196 lines
6.4 KiB
Python
196 lines
6.4 KiB
Python
"""Benchmark metrics collection and persistence.

Tracks per-scenario results: cycles used, wall-clock time, success,
LLM call count, and estimated metabolic cost. Results are persisted
as JSONL for trend analysis and CI regression gates.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from dataclasses import asdict, dataclass, field
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ScenarioResult:
    """Record of one benchmark scenario execution.

    Only ``scenario_name`` is required; every other field has a neutral
    default so a result can be created first and filled in afterwards.

    Attributes:
        scenario_name: Human-readable name of the scenario.
        success: ``True`` when the goal predicate was satisfied.
        cycles_used: Heartbeat cycles actually executed.
        max_cycles: Cycle budget granted to the scenario.
        wall_time_ms: Wall-clock duration in milliseconds.
        llm_calls: Count of LLM inference calls made.
        metabolic_cost: Estimated resource cost (arbitrary unit, ≈ tokens).
        error: Error message if the run crashed, else ``None``.
        tags: Scenario tags (copied for filtering).
    """

    # Identity
    scenario_name: str

    # Outcome
    success: bool = False

    # Cycle accounting
    cycles_used: int = 0
    max_cycles: int = 0

    # Resource usage
    wall_time_ms: int = 0
    llm_calls: int = 0
    metabolic_cost: float = 0.0

    # Diagnostics and filtering; each instance gets its own tag list.
    error: str | None = None
    tags: list[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class BenchmarkMetrics:
    """Collected outcome of one benchmark run over a set of scenarios.

    Attributes:
        results: One entry per executed scenario.
        total_time_ms: Wall-clock duration of the full suite, in milliseconds.
        timestamp: ISO-8601 timestamp of the run.
        commit_sha: Git commit SHA (if available); defaults to ``""``.
    """

    results: list[ScenarioResult] = field(default_factory=list)
    total_time_ms: int = 0
    timestamp: str = ""
    commit_sha: str = ""

    # -- derived properties ------------------------------------------------

    @property
    def pass_count(self) -> int:
        """Number of results whose ``success`` flag is truthy."""
        return len([res for res in self.results if res.success])

    @property
    def fail_count(self) -> int:
        """Number of results whose ``success`` flag is falsy."""
        return len([res for res in self.results if not res.success])

    @property
    def success_rate(self) -> float:
        """Fraction of passing results; ``0.0`` when there are no results."""
        total = len(self.results)
        return self.pass_count / total if total else 0.0

    @property
    def total_llm_calls(self) -> int:
        """Sum of ``llm_calls`` over all results."""
        return sum([res.llm_calls for res in self.results])

    @property
    def total_metabolic_cost(self) -> float:
        """Sum of ``metabolic_cost`` over all results."""
        return sum([res.metabolic_cost for res in self.results])

    # -- persistence -------------------------------------------------------

    def save(self, path: Path) -> None:
        """Append this run as a single JSON line to the file at *path*.

        Missing parent directories are created first. The record holds the
        run metadata, the rounded aggregate figures, and one dict per
        scenario result.
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)

        scenarios = [asdict(res) for res in self.results]
        record = {
            "timestamp": self.timestamp,
            "commit_sha": self.commit_sha,
            "total_time_ms": self.total_time_ms,
            "success_rate": round(self.success_rate, 4),
            "total_llm_calls": self.total_llm_calls,
            "total_metabolic_cost": round(self.total_metabolic_cost, 2),
            "scenarios": scenarios,
        }

        # Append mode: earlier runs in the file are kept.
        with target.open("a") as sink:
            sink.write(f"{json.dumps(record)}\n")
        logger.info("Benchmark results saved to %s", target)

    # -- summary -----------------------------------------------------------

    def summary(self) -> str:
        """Build a multi-line, human-readable report of the run.

        The report has a header with aggregate figures, an optional commit
        line, a blank separator, and then one line per scenario (followed
        by an error line when the scenario recorded an error).
        """
        counts_line = (
            f"Scenarios: {len(self.results)} "
            f"Passed: {self.pass_count} "
            f"Failed: {self.fail_count} "
            f"Success rate: {self.success_rate:.0%}"
        )
        totals_line = (
            f"Total time: {self.total_time_ms} ms "
            f"LLM calls: {self.total_llm_calls} "
            f"Metabolic cost: {self.total_metabolic_cost:.1f}"
        )
        report = ["=== Benchmark Summary ===", counts_line, totals_line]

        if self.commit_sha:
            report.append(f"Commit: {self.commit_sha}")
        report.append("")

        for res in self.results:
            verdict = ("FAIL", "PASS")[bool(res.success)]
            report.append(
                f" [{verdict}] {res.scenario_name} — "
                f"{res.cycles_used}/{res.max_cycles} cycles, "
                f"{res.wall_time_ms} ms, "
                f"{res.llm_calls} LLM calls"
            )
            if res.error:
                report.append(f" Error: {res.error}")

        return "\n".join(report)
|
|
|
|
|
|
def load_history(path: Path) -> list[dict]:
    """Load benchmark history from a JSONL file.

    The file is decoded as UTF-8 and read one physical line at a time.
    Blank lines are ignored. Lines that are not valid JSON, or whose JSON
    value is not an object, are skipped with a logged warning so that a
    single corrupt entry does not discard the rest of the history.

    Args:
        path: Location of the JSONL history file (``str`` or ``Path``).

    Returns:
        List of run records, most recent first. Empty if the file does
        not exist.
    """
    path = Path(path)
    # Same logger object that a module-level ``logging.getLogger(__name__)``
    # yields; looked up here so the function carries no hidden dependency.
    log = logging.getLogger(__name__)

    # EAFP: open directly instead of exists()-then-read, which is racy.
    try:
        handle = path.open(encoding="utf-8")
    except FileNotFoundError:
        return []

    records: list[dict] = []
    with handle:
        # Iterating the file splits on real newlines only. str.splitlines()
        # also breaks on U+2028/U+2029/NEL, which may appear unescaped
        # inside JSON strings and would tear a valid record apart.
        for lineno, raw in enumerate(handle, start=1):
            line = raw.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                log.warning(
                    "Skipping malformed line %d in %s: %s", lineno, path, exc
                )
                continue
            if not isinstance(record, dict):
                log.warning(
                    "Skipping non-object record on line %d in %s", lineno, path
                )
                continue
            records.append(record)

    # Entries are in file order (oldest first); callers expect newest first.
    records.reverse()
    return records
|
|
|
|
|
|
def compare_runs(
    current: BenchmarkMetrics,
    baseline: BenchmarkMetrics,
    *,
    slowdown_factor: float = 1.5,
) -> str:
    """Compare two benchmark runs and report regressions.

    The report contains the overall success-rate change, the metabolic
    cost change, and one line per scenario whose status is noteworthy:

    * ``[NEW]`` – present in *current* but not in *baseline*.
    * ``[REGRESSION]`` – passed in *baseline*, fails in *current*.
    * ``[IMPROVEMENT]`` – failed in *baseline*, passes in *current*.
    * ``[SLOWER]`` – same pass/fail status, but ``cycles_used`` exceeds
      the baseline's by more than *slowdown_factor*.
    * ``[REMOVED]`` – present in *baseline* but missing from *current*.

    Scenarios are matched by ``scenario_name``; if a name occurs more than
    once in a run, the last occurrence is used for the comparison.

    Args:
        current: Metrics of the run under test.
        baseline: Metrics of the reference run.
        slowdown_factor: Multiplier on the baseline's ``cycles_used``
            above which a scenario is flagged as slower.

    Returns:
        Human-readable comparison report.
    """
    lines = ["=== Regression Report ==="]

    # Overall
    rate_delta = current.success_rate - baseline.success_rate
    lines.append(
        f"Success rate: {baseline.success_rate:.0%} -> {current.success_rate:.0%} "
        f"({rate_delta:+.0%})"
    )

    cost_delta = current.total_metabolic_cost - baseline.total_metabolic_cost
    if baseline.total_metabolic_cost > 0:
        cost_pct = (cost_delta / baseline.total_metabolic_cost) * 100
        lines.append(
            f"Metabolic cost: {baseline.total_metabolic_cost:.1f} -> "
            f"{current.total_metabolic_cost:.1f} ({cost_pct:+.1f}%)"
        )
    elif cost_delta:
        # A percentage is undefined without a positive baseline; still
        # surface the change instead of silently dropping it.
        lines.append(
            f"Metabolic cost: {baseline.total_metabolic_cost:.1f} -> "
            f"{current.total_metabolic_cost:.1f} "
            f"({cost_delta:+.1f}, no baseline for % change)"
        )

    # Per-scenario
    baseline_map = {r.scenario_name: r for r in baseline.results}
    for r in current.results:
        b = baseline_map.get(r.scenario_name)
        if b is None:
            lines.append(f" [NEW] {r.scenario_name}")
            continue
        if b.success and not r.success:
            lines.append(f" [REGRESSION] {r.scenario_name} — was PASS, now FAIL")
        elif not b.success and r.success:
            lines.append(f" [IMPROVEMENT] {r.scenario_name} — was FAIL, now PASS")
        elif r.cycles_used > b.cycles_used * slowdown_factor:
            lines.append(
                f" [SLOWER] {r.scenario_name} — "
                f"{b.cycles_used} -> {r.cycles_used} cycles (+{r.cycles_used - b.cycles_used})"
            )

    # A scenario that disappeared can hide a regression, so report it too.
    current_names = {r.scenario_name for r in current.results}
    for name in baseline_map:
        if name not in current_names:
            lines.append(
                f" [REMOVED] {name} — present in baseline, missing from current run"
            )

    return "\n".join(lines)
|