Co-authored-by: Perplexity Computer <perplexity@tower.local> Co-committed-by: Perplexity Computer <perplexity@tower.local>
342 lines
11 KiB
Python
342 lines
11 KiB
Python
"""Graduation Test — Falsework Removal Criteria.
|
|
|
|
Evaluates whether the agent meets all five graduation conditions
|
|
simultaneously. All conditions must be met within a single 24-hour
|
|
period for the system to be considered sovereign.
|
|
|
|
Conditions:
|
|
1. Perception Independence — 1 hour with no VLM calls after minute 15
|
|
2. Decision Independence — Full session with <5 cloud API calls
|
|
3. Narration Independence — All narration from local templates + local LLM
|
|
4. Economic Independence — sats_earned > sats_spent
|
|
5. Operational Independence — 24 hours unattended, no human intervention
|
|
|
|
Each condition returns a :class:`GraduationResult` with pass/fail,
|
|
the actual measured value, and the target.
|
|
|
|
"The arch must hold after the falsework is removed."
|
|
|
|
Refs: #953 (The Sovereignty Loop — Graduation Test)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import UTC, datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ── Data classes ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class ConditionResult:
    """Result of a single graduation condition evaluation."""

    # Human-readable condition name, e.g. "Perception Independence".
    name: str
    # Whether the condition was met.
    passed: bool
    # Measured value; the evaluate_* functions use a negative sentinel
    # (-1) when the measurement itself failed.
    actual: float | int
    # Threshold the measured value is compared against.
    target: float | int
    # Display suffix appended to actual/target in reports (e.g. " calls").
    unit: str = ""
    # Optional free-form explanation shown in the markdown report.
    detail: str = ""
|
|
|
|
|
|
@dataclass
|
|
class GraduationReport:
|
|
"""Full graduation test report."""
|
|
|
|
timestamp: str = field(default_factory=lambda: datetime.now(UTC).isoformat())
|
|
all_passed: bool = False
|
|
conditions: list[ConditionResult] = field(default_factory=list)
|
|
metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
def to_dict(self) -> dict[str, Any]:
|
|
"""Serialize to a JSON-safe dict."""
|
|
return {
|
|
"timestamp": self.timestamp,
|
|
"all_passed": self.all_passed,
|
|
"conditions": [asdict(c) for c in self.conditions],
|
|
"metadata": self.metadata,
|
|
}
|
|
|
|
def to_markdown(self) -> str:
|
|
"""Render the report as a markdown string."""
|
|
status = "PASSED ✓" if self.all_passed else "NOT YET"
|
|
lines = [
|
|
"# Graduation Test Report",
|
|
"",
|
|
f"**Status:** {status}",
|
|
f"**Evaluated:** {self.timestamp}",
|
|
"",
|
|
"| # | Condition | Target | Actual | Result |",
|
|
"|---|-----------|--------|--------|--------|",
|
|
]
|
|
for i, c in enumerate(self.conditions, 1):
|
|
result_str = "PASS" if c.passed else "FAIL"
|
|
actual_str = f"{c.actual}{c.unit}" if c.unit else str(c.actual)
|
|
target_str = f"{c.target}{c.unit}" if c.unit else str(c.target)
|
|
lines.append(f"| {i} | {c.name} | {target_str} | {actual_str} | {result_str} |")
|
|
|
|
lines.append("")
|
|
for c in self.conditions:
|
|
if c.detail:
|
|
lines.append(f"- **{c.name}**: {c.detail}")
|
|
|
|
lines.append("")
|
|
lines.append('> "The arch must hold after the falsework is removed."')
|
|
return "\n".join(lines)
|
|
|
|
|
|
# ── Evaluation functions ──────────────────────────────────────────────────────
|
|
|
|
|
|
def evaluate_perception_independence(
    time_window_seconds: float = 3600.0,
    warmup_seconds: float = 900.0,
) -> ConditionResult:
    """Test 1: No VLM calls after the first 15 minutes of a 1-hour window.

    Parameters
    ----------
    time_window_seconds:
        Total window to evaluate (default: 1 hour).
    warmup_seconds:
        Initial warmup period where VLM calls are expected (default: 15 min).
    """
    from timmy.sovereignty.metrics import get_metrics_store

    store = get_metrics_store()

    # Count VLM calls in the post-warmup period only: from the end of the
    # warmup (``time_window - warmup`` seconds ago) until now.
    try:
        from contextlib import closing

        from timmy.sovereignty.metrics import _seconds_ago_iso

        # Instant at which the warmup ends; calls at or after this moment
        # count against the condition.
        cutoff_warmup = _seconds_ago_iso(time_window_seconds - warmup_seconds)

        with closing(store._connect()) as conn:
            # BUG FIX: the previous query counted events in
            # [window start, warmup end) — i.e. *inside* the warmup window,
            # where VLM calls are allowed — so the condition measured the
            # wrong period. Count from the end of warmup onward instead.
            vlm_calls_after_warmup = conn.execute(
                "SELECT COUNT(*) FROM events WHERE event_type = 'perception_vlm_call' "
                "AND timestamp >= ?",
                (cutoff_warmup,),
            ).fetchone()[0]
    except Exception as exc:
        logger.warning("Failed to evaluate perception independence: %s", exc)
        # Negative sentinel: measurement failed, so the condition cannot pass.
        vlm_calls_after_warmup = -1

    passed = vlm_calls_after_warmup == 0
    return ConditionResult(
        name="Perception Independence",
        passed=passed,
        actual=vlm_calls_after_warmup,
        target=0,
        unit=" VLM calls",
        detail=f"VLM calls in last {int((time_window_seconds - warmup_seconds) / 60)} min: {vlm_calls_after_warmup}",
    )
|
|
|
|
|
|
def evaluate_decision_independence(
    max_api_calls: int = 5,
) -> ConditionResult:
    """Test 2: Full session with <5 cloud API calls total.

    Counts ``decision_llm_call`` events in the current session.
    """
    from timmy.sovereignty.metrics import get_metrics_store

    store = get_metrics_store()

    try:
        from contextlib import closing

        from timmy.sovereignty.metrics import _seconds_ago_iso

        # Session window: the last 24 hours.
        window_start = _seconds_ago_iso(86400.0)
        count_query = (
            "SELECT COUNT(*) FROM events WHERE event_type IN "
            "('decision_llm_call', 'api_call') AND timestamp >= ?"
        )
        with closing(store._connect()) as conn:
            api_calls = conn.execute(count_query, (window_start,)).fetchone()[0]
    except Exception as exc:
        logger.warning("Failed to evaluate decision independence: %s", exc)
        # Negative sentinel: measurement failed, so the condition cannot pass.
        api_calls = -1

    # The lower bound excludes the -1 sentinel from passing.
    passed = 0 <= api_calls < max_api_calls
    return ConditionResult(
        name="Decision Independence",
        passed=passed,
        actual=api_calls,
        target=max_api_calls,
        unit=" calls",
        detail=f"Cloud API calls in last 24h: {api_calls} (target: <{max_api_calls})",
    )
|
|
|
|
|
|
def evaluate_narration_independence() -> ConditionResult:
    """Test 3: All narration from local templates + local LLM (zero cloud calls).

    Checks that ``narration_llm`` events are zero in the last 24 hours
    while ``narration_template`` events are non-zero.
    """
    # Lazy import: avoids paying for the metrics store at module import time.
    from timmy.sovereignty.metrics import get_metrics_store

    store = get_metrics_store()

    try:
        from contextlib import closing

        from timmy.sovereignty.metrics import _seconds_ago_iso

        # Window start: 24 hours ago.
        cutoff = _seconds_ago_iso(86400.0)

        with closing(store._connect()) as conn:
            # Cloud-backed narration events — must be zero to pass.
            cloud_narrations = conn.execute(
                "SELECT COUNT(*) FROM events WHERE event_type = 'narration_llm' AND timestamp >= ?",
                (cutoff,),
            ).fetchone()[0]
            # Local template narration events — must be non-zero, proving
            # the system actually narrated (not just stayed silent).
            local_narrations = conn.execute(
                "SELECT COUNT(*) FROM events WHERE event_type = 'narration_template' "
                "AND timestamp >= ?",
                (cutoff,),
            ).fetchone()[0]
    except Exception as exc:
        logger.warning("Failed to evaluate narration independence: %s", exc)
        # Sentinels chosen so the condition below cannot pass on failure.
        cloud_narrations = -1
        local_narrations = 0

    passed = cloud_narrations == 0 and local_narrations > 0
    return ConditionResult(
        name="Narration Independence",
        passed=passed,
        actual=cloud_narrations,
        target=0,
        unit=" cloud calls",
        detail=f"Cloud narration calls: {cloud_narrations}, local: {local_narrations}",
    )
|
|
|
|
|
|
def evaluate_economic_independence(
    sats_earned: float = 0.0,
    sats_spent: float = 0.0,
) -> ConditionResult:
    """Test 4: sats_earned > sats_spent.

    Parameters are passed in because sat tracking may live in a separate
    ledger (Lightning, #851).
    """
    # A strictly positive income is required: earning nothing and spending
    # nothing does not count as economic independence.
    balance = sats_earned - sats_spent
    is_profitable = sats_earned > 0 and sats_earned > sats_spent
    return ConditionResult(
        name="Economic Independence",
        passed=is_profitable,
        actual=balance,
        target=0,
        unit=" sats net",
        detail=f"Earned: {sats_earned} sats, spent: {sats_spent} sats, net: {balance}",
    )
|
|
|
|
|
|
def evaluate_operational_independence(
    uptime_hours: float = 0.0,
    target_hours: float = 23.5,
    human_interventions: int = 0,
) -> ConditionResult:
    """Test 5: 24 hours unattended, no human intervention.

    Uptime and intervention count are passed in from the heartbeat
    system (#872).
    """
    # Both criteria must hold: sufficient continuous uptime AND zero
    # human interventions during that period.
    meets_uptime = uptime_hours >= target_hours
    unattended = human_interventions == 0
    return ConditionResult(
        name="Operational Independence",
        passed=meets_uptime and unattended,
        actual=uptime_hours,
        target=target_hours,
        unit=" hours",
        detail=f"Uptime: {uptime_hours}h (target: {target_hours}h), interventions: {human_interventions}",
    )
|
|
|
|
|
|
# ── Full graduation test ─────────────────────────────────────────────────────
|
|
|
|
|
|
def run_graduation_test(
    sats_earned: float = 0.0,
    sats_spent: float = 0.0,
    uptime_hours: float = 0.0,
    human_interventions: int = 0,
) -> GraduationReport:
    """Run the full 5-condition graduation test.

    Parameters for economic and operational independence must be supplied
    by the caller since they depend on external systems (Lightning ledger,
    heartbeat monitor).

    Returns
    -------
    GraduationReport
        Full report with per-condition results and overall pass/fail.
    """
    # Evaluate all five conditions; order matches the module docstring.
    results = [
        evaluate_perception_independence(),
        evaluate_decision_independence(),
        evaluate_narration_independence(),
        evaluate_economic_independence(sats_earned, sats_spent),
        evaluate_operational_independence(uptime_hours, human_interventions=human_interventions),
    ]

    overall = all(res.passed for res in results)

    report = GraduationReport(
        all_passed=overall,
        conditions=results,
        metadata={
            "sats_earned": sats_earned,
            "sats_spent": sats_spent,
            "uptime_hours": uptime_hours,
            "human_interventions": human_interventions,
        },
    )

    if overall:
        logger.info("GRADUATION TEST PASSED — all 5 conditions met simultaneously")
    else:
        failing = [res.name for res in results if not res.passed]
        logger.info(
            "Graduation test: %d/5 passed. Failed: %s",
            len(results) - len(failing),
            ", ".join(failing),
        )

    return report
|
|
|
|
|
|
def persist_graduation_report(report: GraduationReport) -> Path:
    """Save a graduation report to ``data/graduation_reports/``.

    Returns the target path even when the write fails (a warning is
    logged instead of raising); callers that must know whether the file
    exists should check ``path.exists()``.
    """
    reports_dir = Path(settings.repo_root) / "data" / "graduation_reports"
    reports_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    path = reports_dir / f"graduation_{timestamp}.json"

    try:
        # Explicit UTF-8 so the report's encoding does not depend on the
        # platform locale; default=str keeps unexpected value types from
        # aborting the dump (they are stringified instead).
        with path.open("w", encoding="utf-8") as f:
            json.dump(report.to_dict(), f, indent=2, default=str)
        logger.info("Graduation report saved to %s", path)
    except Exception as exc:
        logger.warning("Failed to persist graduation report: %s", exc)

    return path
|