Files
Timmy-time-dashboard/src/timmy/sovereignty/graduation.py
Perplexity Computer 4ec4558a2f
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
[perplexity] feat: Sovereignty Loop core framework — auto-crystallizer, graduation test, orchestration (#953) (#1331)
Co-authored-by: Perplexity Computer <perplexity@tower.local>
Co-committed-by: Perplexity Computer <perplexity@tower.local>
2026-03-24 02:29:39 +00:00

342 lines
11 KiB
Python

"""Graduation Test — Falsework Removal Criteria.
Evaluates whether the agent meets all five graduation conditions
simultaneously. All conditions must be met within a single 24-hour
period for the system to be considered sovereign.
Conditions:
1. Perception Independence — 1 hour with no VLM calls after minute 15
2. Decision Independence — Full session with <5 cloud API calls
3. Narration Independence — All narration from local templates + local LLM
4. Economic Independence — sats_earned > sats_spent
5. Operational Independence — 24 hours unattended, no human intervention
Each condition returns a :class:`GraduationResult` with pass/fail,
the actual measured value, and the target.
"The arch must hold after the falsework is removed."
Refs: #953 (The Sovereignty Loop — Graduation Test)
"""
from __future__ import annotations
import json
import logging
from dataclasses import asdict, dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from config import settings
logger = logging.getLogger(__name__)
# ── Data classes ──────────────────────────────────────────────────────────────
@dataclass
class ConditionResult:
    """Result of a single graduation condition evaluation.

    One instance is produced per condition by the ``evaluate_*`` functions
    and aggregated into a :class:`GraduationReport`.
    """

    # Human-readable condition name, e.g. "Perception Independence".
    name: str
    # True when the measured value meets the condition's target.
    passed: bool
    # Measured value; evaluators use -1 to signal a measurement failure.
    actual: float | int
    # Threshold the condition is judged against.
    target: float | int
    # Display suffix appended to actual/target in reports (e.g. " hours").
    unit: str = ""
    # Optional free-text explanation rendered in the markdown report.
    detail: str = ""
@dataclass
class GraduationReport:
"""Full graduation test report."""
timestamp: str = field(default_factory=lambda: datetime.now(UTC).isoformat())
all_passed: bool = False
conditions: list[ConditionResult] = field(default_factory=list)
metadata: dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
"""Serialize to a JSON-safe dict."""
return {
"timestamp": self.timestamp,
"all_passed": self.all_passed,
"conditions": [asdict(c) for c in self.conditions],
"metadata": self.metadata,
}
def to_markdown(self) -> str:
"""Render the report as a markdown string."""
status = "PASSED ✓" if self.all_passed else "NOT YET"
lines = [
"# Graduation Test Report",
"",
f"**Status:** {status}",
f"**Evaluated:** {self.timestamp}",
"",
"| # | Condition | Target | Actual | Result |",
"|---|-----------|--------|--------|--------|",
]
for i, c in enumerate(self.conditions, 1):
result_str = "PASS" if c.passed else "FAIL"
actual_str = f"{c.actual}{c.unit}" if c.unit else str(c.actual)
target_str = f"{c.target}{c.unit}" if c.unit else str(c.target)
lines.append(f"| {i} | {c.name} | {target_str} | {actual_str} | {result_str} |")
lines.append("")
for c in self.conditions:
if c.detail:
lines.append(f"- **{c.name}**: {c.detail}")
lines.append("")
lines.append('> "The arch must hold after the falsework is removed."')
return "\n".join(lines)
# ── Evaluation functions ──────────────────────────────────────────────────────
def evaluate_perception_independence(
    time_window_seconds: float = 3600.0,
    warmup_seconds: float = 900.0,
) -> ConditionResult:
    """Test 1: No VLM calls after the first 15 minutes of a 1-hour window.

    Parameters
    ----------
    time_window_seconds:
        Total window to evaluate (default: 1 hour).
    warmup_seconds:
        Initial warmup period where VLM calls are expected (default: 15 min).

    Returns
    -------
    ConditionResult
        Passes only when zero ``perception_vlm_call`` events occurred in the
        post-warmup slice of the window. ``actual == -1`` means the metrics
        store could not be queried (which fails the condition).
    """
    from timmy.sovereignty.metrics import get_metrics_store

    store = get_metrics_store()
    try:
        from contextlib import closing

        from timmy.sovereignty.metrics import _seconds_ago_iso

        # The window runs from `time_window_seconds` ago until now; its first
        # `warmup_seconds` are grace time.  The post-warmup slice therefore
        # starts `time_window_seconds - warmup_seconds` ago and extends to the
        # present, so only a LOWER timestamp bound is needed.
        # BUGFIX: the previous query used both cutoffs with inverted roles
        # ("timestamp >= one_hour_ago AND timestamp < 45min_ago"), which
        # counted VLM calls during the WARMUP slice instead of after it.
        cutoff_post_warmup = _seconds_ago_iso(time_window_seconds - warmup_seconds)
        with closing(store._connect()) as conn:
            vlm_calls_after_warmup = conn.execute(
                "SELECT COUNT(*) FROM events WHERE event_type = 'perception_vlm_call' "
                "AND timestamp >= ?",
                (cutoff_post_warmup,),
            ).fetchone()[0]
    except Exception as exc:
        logger.warning("Failed to evaluate perception independence: %s", exc)
        vlm_calls_after_warmup = -1  # sentinel: measurement failed
    passed = vlm_calls_after_warmup == 0
    return ConditionResult(
        name="Perception Independence",
        passed=passed,
        actual=vlm_calls_after_warmup,
        target=0,
        unit=" VLM calls",
        detail=f"VLM calls in last {int((time_window_seconds - warmup_seconds) / 60)} min: {vlm_calls_after_warmup}",
    )
def evaluate_decision_independence(
    max_api_calls: int = 5,
) -> ConditionResult:
    """Test 2: Full session with <5 cloud API calls total.

    Counts ``decision_llm_call`` events in the current session.
    """
    from timmy.sovereignty.metrics import get_metrics_store

    store = get_metrics_store()
    try:
        from contextlib import closing

        from timmy.sovereignty.metrics import _seconds_ago_iso

        # A "session" is approximated here as the trailing 24 hours.
        window_start = _seconds_ago_iso(86400.0)
        query = (
            "SELECT COUNT(*) FROM events WHERE event_type IN "
            "('decision_llm_call', 'api_call') AND timestamp >= ?"
        )
        with closing(store._connect()) as conn:
            api_calls = conn.execute(query, (window_start,)).fetchone()[0]
    except Exception as exc:
        logger.warning("Failed to evaluate decision independence: %s", exc)
        api_calls = -1  # sentinel: measurement failed (also fails the 0 <= check)
    passed = 0 <= api_calls < max_api_calls
    return ConditionResult(
        name="Decision Independence",
        passed=passed,
        actual=api_calls,
        target=max_api_calls,
        unit=" calls",
        detail=f"Cloud API calls in last 24h: {api_calls} (target: <{max_api_calls})",
    )
def evaluate_narration_independence() -> ConditionResult:
    """Test 3: All narration from local templates + local LLM (zero cloud calls).

    Checks that ``narration_llm`` events are zero in the last 24 hours
    while ``narration_template`` events are non-zero.
    """
    from timmy.sovereignty.metrics import get_metrics_store

    store = get_metrics_store()
    try:
        from contextlib import closing

        from timmy.sovereignty.metrics import _seconds_ago_iso

        since = _seconds_ago_iso(86400.0)
        with closing(store._connect()) as conn:
            # Two counts over the same trailing-24h window: cloud vs local.
            cloud_narrations = conn.execute(
                "SELECT COUNT(*) FROM events WHERE event_type = 'narration_llm' AND timestamp >= ?",
                (since,),
            ).fetchone()[0]
            local_narrations = conn.execute(
                "SELECT COUNT(*) FROM events WHERE event_type = 'narration_template' "
                "AND timestamp >= ?",
                (since,),
            ).fetchone()[0]
    except Exception as exc:
        logger.warning("Failed to evaluate narration independence: %s", exc)
        # Sentinel values: a failed measurement cannot pass the condition.
        cloud_narrations = -1
        local_narrations = 0
    # Pass requires zero cloud narration AND evidence of local narration.
    passed = (cloud_narrations == 0) and (local_narrations > 0)
    return ConditionResult(
        name="Narration Independence",
        passed=passed,
        actual=cloud_narrations,
        target=0,
        unit=" cloud calls",
        detail=f"Cloud narration calls: {cloud_narrations}, local: {local_narrations}",
    )
def evaluate_economic_independence(
    sats_earned: float = 0.0,
    sats_spent: float = 0.0,
) -> ConditionResult:
    """Test 4: sats_earned > sats_spent.

    Parameters are passed in because sat tracking may live in a separate
    ledger (Lightning, #851).
    """
    net = sats_earned - sats_spent
    # Require genuinely positive earnings, not merely "spent nothing".
    in_surplus = sats_earned > sats_spent
    earned_something = sats_earned > 0
    return ConditionResult(
        name="Economic Independence",
        passed=in_surplus and earned_something,
        actual=net,
        target=0,
        unit=" sats net",
        detail=f"Earned: {sats_earned} sats, spent: {sats_spent} sats, net: {net}",
    )
def evaluate_operational_independence(
    uptime_hours: float = 0.0,
    target_hours: float = 23.5,
    human_interventions: int = 0,
) -> ConditionResult:
    """Test 5: 24 hours unattended, no human intervention.

    Uptime and intervention count are passed in from the heartbeat
    system (#872).
    """
    # Both sub-criteria must hold: sufficient uptime AND zero interventions.
    enough_uptime = uptime_hours >= target_hours
    fully_unattended = human_interventions == 0
    return ConditionResult(
        name="Operational Independence",
        passed=enough_uptime and fully_unattended,
        actual=uptime_hours,
        target=target_hours,
        unit=" hours",
        detail=f"Uptime: {uptime_hours}h (target: {target_hours}h), interventions: {human_interventions}",
    )
# ── Full graduation test ─────────────────────────────────────────────────────
def run_graduation_test(
    sats_earned: float = 0.0,
    sats_spent: float = 0.0,
    uptime_hours: float = 0.0,
    human_interventions: int = 0,
) -> GraduationReport:
    """Run the full 5-condition graduation test.

    Parameters for economic and operational independence must be supplied
    by the caller since they depend on external systems (Lightning ledger,
    heartbeat monitor).

    Returns
    -------
    GraduationReport
        Full report with per-condition results and overall pass/fail.
    """
    # Evaluate all five conditions; the metrics-backed ones read their own
    # data, the last two are judged from the caller-supplied figures.
    evaluations = [
        evaluate_perception_independence(),
        evaluate_decision_independence(),
        evaluate_narration_independence(),
        evaluate_economic_independence(sats_earned, sats_spent),
        evaluate_operational_independence(uptime_hours, human_interventions=human_interventions),
    ]
    failed_names = [cond.name for cond in evaluations if not cond.passed]
    overall = not failed_names
    report = GraduationReport(
        all_passed=overall,
        conditions=evaluations,
        metadata={
            "sats_earned": sats_earned,
            "sats_spent": sats_spent,
            "uptime_hours": uptime_hours,
            "human_interventions": human_interventions,
        },
    )
    if overall:
        logger.info("GRADUATION TEST PASSED — all 5 conditions met simultaneously")
    else:
        logger.info(
            "Graduation test: %d/5 passed. Failed: %s",
            len(evaluations) - len(failed_names),
            ", ".join(failed_names),
        )
    return report
def persist_graduation_report(report: GraduationReport) -> Path:
    """Save a graduation report to ``data/graduation_reports/``.

    Parameters
    ----------
    report:
        The report to serialize (written as indented JSON).

    Returns
    -------
    Path
        The target file path.  Persistence is best-effort: a write failure
        is only logged, in which case the returned path may not exist.
    """
    reports_dir = Path(settings.repo_root) / "data" / "graduation_reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    path = reports_dir / f"graduation_{timestamp}.json"
    try:
        # Explicit UTF-8 avoids depending on the platform's locale encoding.
        with path.open("w", encoding="utf-8") as f:
            # default=str is deliberate: stringify any non-JSON-native value
            # rather than abort persistence.
            json.dump(report.to_dict(), f, indent=2, default=str)
        logger.info("Graduation report saved to %s", path)
    except Exception as exc:
        logger.warning("Failed to persist graduation report: %s", exc)
    return path