Co-authored-by: Perplexity Computer <perplexity@tower.local> Co-committed-by: Perplexity Computer <perplexity@tower.local>
342 lines
11 KiB
Python
342 lines
11 KiB
Python
"""Graduation Test — Falsework Removal Criteria.
|
|
|
|
Evaluates whether the agent meets all five graduation conditions
|
|
simultaneously. All conditions must be met within a single 24-hour
|
|
period for the system to be considered sovereign.
|
|
|
|
Conditions:
|
|
1. Perception Independence — 1 hour with no VLM calls after minute 15
|
|
2. Decision Independence — Full session with <5 cloud API calls
|
|
3. Narration Independence — All narration from local templates + local LLM
|
|
4. Economic Independence — sats_earned > sats_spent
|
|
5. Operational Independence — 24 hours unattended, no human intervention
|
|
|
|
Each condition returns a :class:`GraduationResult` with pass/fail,
|
|
the actual measured value, and the target.
|
|
|
|
"The arch must hold after the falsework is removed."
|
|
|
|
Refs: #953 (The Sovereignty Loop — Graduation Test)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import UTC, datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ── Data classes ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class ConditionResult:
    """Result of a single graduation condition evaluation."""

    # Human-readable condition name, e.g. "Perception Independence".
    name: str
    # Whether the condition was met.
    passed: bool
    # Measured value; the evaluate_* functions use a negative sentinel
    # (-1) when the measurement itself failed.
    actual: float | int
    # Threshold the measured value is compared against.
    target: float | int
    # Display suffix appended to actual/target in reports (e.g. " calls").
    unit: str = ""
    # Optional free-form explanation shown in the markdown report.
    detail: str = ""
|
|
|
|
|
|
@dataclass
|
|
class GraduationReport:
|
|
"""Full graduation test report."""
|
|
|
|
timestamp: str = field(default_factory=lambda: datetime.now(UTC).isoformat())
|
|
all_passed: bool = False
|
|
conditions: list[ConditionResult] = field(default_factory=list)
|
|
metadata: dict[str, Any] = field(default_factory=dict)
|
|
|
|
def to_dict(self) -> dict[str, Any]:
|
|
"""Serialize to a JSON-safe dict."""
|
|
return {
|
|
"timestamp": self.timestamp,
|
|
"all_passed": self.all_passed,
|
|
"conditions": [asdict(c) for c in self.conditions],
|
|
"metadata": self.metadata,
|
|
}
|
|
|
|
def to_markdown(self) -> str:
|
|
"""Render the report as a markdown string."""
|
|
status = "PASSED ✓" if self.all_passed else "NOT YET"
|
|
lines = [
|
|
"# Graduation Test Report",
|
|
"",
|
|
f"**Status:** {status}",
|
|
f"**Evaluated:** {self.timestamp}",
|
|
"",
|
|
"| # | Condition | Target | Actual | Result |",
|
|
"|---|-----------|--------|--------|--------|",
|
|
]
|
|
for i, c in enumerate(self.conditions, 1):
|
|
result_str = "PASS" if c.passed else "FAIL"
|
|
actual_str = f"{c.actual}{c.unit}" if c.unit else str(c.actual)
|
|
target_str = f"{c.target}{c.unit}" if c.unit else str(c.target)
|
|
lines.append(f"| {i} | {c.name} | {target_str} | {actual_str} | {result_str} |")
|
|
|
|
lines.append("")
|
|
for c in self.conditions:
|
|
if c.detail:
|
|
lines.append(f"- **{c.name}**: {c.detail}")
|
|
|
|
lines.append("")
|
|
lines.append('> "The arch must hold after the falsework is removed."')
|
|
return "\n".join(lines)
|
|
|
|
|
|
# ── Evaluation functions ──────────────────────────────────────────────────────
|
|
|
|
|
|
def evaluate_perception_independence(
    time_window_seconds: float = 3600.0,
    warmup_seconds: float = 900.0,
) -> ConditionResult:
    """Test 1: No VLM calls after the first 15 minutes of a 1-hour window.

    Parameters
    ----------
    time_window_seconds:
        Total window to evaluate (default: 1 hour).
    warmup_seconds:
        Initial warmup period where VLM calls are expected (default: 15 min).
    """
    from timmy.sovereignty.metrics import get_metrics_store

    store = get_metrics_store()

    # Count VLM calls in the post-warmup period only: from the end of the
    # warmup (``time_window - warmup`` seconds ago) until now.
    try:
        from contextlib import closing

        from timmy.sovereignty.metrics import _seconds_ago_iso

        # Instant at which the warmup ends; calls at or after this moment
        # count against the condition.
        cutoff_warmup = _seconds_ago_iso(time_window_seconds - warmup_seconds)

        with closing(store._connect()) as conn:
            # BUG FIX: the previous query counted events in
            # [window start, warmup end) — i.e. *inside* the warmup window,
            # where VLM calls are allowed — so the condition measured the
            # wrong period. Count from the end of warmup onward instead.
            vlm_calls_after_warmup = conn.execute(
                "SELECT COUNT(*) FROM events WHERE event_type = 'perception_vlm_call' "
                "AND timestamp >= ?",
                (cutoff_warmup,),
            ).fetchone()[0]
    except Exception as exc:
        logger.warning("Failed to evaluate perception independence: %s", exc)
        # Negative sentinel: measurement failed, so the condition cannot pass.
        vlm_calls_after_warmup = -1

    passed = vlm_calls_after_warmup == 0
    return ConditionResult(
        name="Perception Independence",
        passed=passed,
        actual=vlm_calls_after_warmup,
        target=0,
        unit=" VLM calls",
        detail=f"VLM calls in last {int((time_window_seconds - warmup_seconds) / 60)} min: {vlm_calls_after_warmup}",
    )
|
|
|
|
|
|
def evaluate_decision_independence(
    max_api_calls: int = 5,
) -> ConditionResult:
    """Test 2: Full session with <5 cloud API calls total.

    Counts ``decision_llm_call`` events in the current session.
    """
    from timmy.sovereignty.metrics import get_metrics_store

    store = get_metrics_store()

    try:
        from contextlib import closing

        from timmy.sovereignty.metrics import _seconds_ago_iso

        # Session window: the last 24 hours.
        window_start = _seconds_ago_iso(86400.0)
        count_query = (
            "SELECT COUNT(*) FROM events WHERE event_type IN "
            "('decision_llm_call', 'api_call') AND timestamp >= ?"
        )
        with closing(store._connect()) as conn:
            api_calls = conn.execute(count_query, (window_start,)).fetchone()[0]
    except Exception as exc:
        logger.warning("Failed to evaluate decision independence: %s", exc)
        # Negative sentinel: measurement failed, so the condition cannot pass.
        api_calls = -1

    # The lower bound excludes the -1 sentinel from passing.
    passed = 0 <= api_calls < max_api_calls
    return ConditionResult(
        name="Decision Independence",
        passed=passed,
        actual=api_calls,
        target=max_api_calls,
        unit=" calls",
        detail=f"Cloud API calls in last 24h: {api_calls} (target: <{max_api_calls})",
    )
|
|
|
|
|
|
def evaluate_narration_independence() -> ConditionResult:
    """Test 3: All narration from local templates + local LLM (zero cloud calls).

    Checks that ``narration_llm`` events are zero in the last 24 hours
    while ``narration_template`` events are non-zero.
    """
    # Lazy import: avoids paying for the metrics store at module import time.
    from timmy.sovereignty.metrics import get_metrics_store

    store = get_metrics_store()

    try:
        from contextlib import closing

        from timmy.sovereignty.metrics import _seconds_ago_iso

        # Window start: 24 hours ago.
        cutoff = _seconds_ago_iso(86400.0)

        with closing(store._connect()) as conn:
            # Cloud-backed narration events — must be zero to pass.
            cloud_narrations = conn.execute(
                "SELECT COUNT(*) FROM events WHERE event_type = 'narration_llm' AND timestamp >= ?",
                (cutoff,),
            ).fetchone()[0]
            # Local template narration events — must be non-zero, proving
            # the system actually narrated (not just stayed silent).
            local_narrations = conn.execute(
                "SELECT COUNT(*) FROM events WHERE event_type = 'narration_template' "
                "AND timestamp >= ?",
                (cutoff,),
            ).fetchone()[0]
    except Exception as exc:
        logger.warning("Failed to evaluate narration independence: %s", exc)
        # Sentinels chosen so the condition below cannot pass on failure.
        cloud_narrations = -1
        local_narrations = 0

    passed = cloud_narrations == 0 and local_narrations > 0
    return ConditionResult(
        name="Narration Independence",
        passed=passed,
        actual=cloud_narrations,
        target=0,
        unit=" cloud calls",
        detail=f"Cloud narration calls: {cloud_narrations}, local: {local_narrations}",
    )
|
|
|
|
|
|
def evaluate_economic_independence(
    sats_earned: float = 0.0,
    sats_spent: float = 0.0,
) -> ConditionResult:
    """Test 4: sats_earned > sats_spent.

    Parameters are passed in because sat tracking may live in a separate
    ledger (Lightning, #851).
    """
    # A strictly positive income is required: earning nothing and spending
    # nothing does not count as economic independence.
    balance = sats_earned - sats_spent
    is_profitable = sats_earned > 0 and sats_earned > sats_spent
    return ConditionResult(
        name="Economic Independence",
        passed=is_profitable,
        actual=balance,
        target=0,
        unit=" sats net",
        detail=f"Earned: {sats_earned} sats, spent: {sats_spent} sats, net: {balance}",
    )
|
|
|
|
|
|
def evaluate_operational_independence(
    uptime_hours: float = 0.0,
    target_hours: float = 23.5,
    human_interventions: int = 0,
) -> ConditionResult:
    """Test 5: 24 hours unattended, no human intervention.

    Uptime and intervention count are passed in from the heartbeat
    system (#872).
    """
    # Both criteria must hold: sufficient continuous uptime AND zero
    # human interventions during that period.
    meets_uptime = uptime_hours >= target_hours
    unattended = human_interventions == 0
    return ConditionResult(
        name="Operational Independence",
        passed=meets_uptime and unattended,
        actual=uptime_hours,
        target=target_hours,
        unit=" hours",
        detail=f"Uptime: {uptime_hours}h (target: {target_hours}h), interventions: {human_interventions}",
    )
|
|
|
|
|
|
# ── Full graduation test ─────────────────────────────────────────────────────
|
|
|
|
|
|
def run_graduation_test(
    sats_earned: float = 0.0,
    sats_spent: float = 0.0,
    uptime_hours: float = 0.0,
    human_interventions: int = 0,
) -> GraduationReport:
    """Run the full 5-condition graduation test.

    Parameters for economic and operational independence must be supplied
    by the caller since they depend on external systems (Lightning ledger,
    heartbeat monitor).

    Returns
    -------
    GraduationReport
        Full report with per-condition results and overall pass/fail.
    """
    # Evaluate all five conditions; order matches the module docstring.
    results = [
        evaluate_perception_independence(),
        evaluate_decision_independence(),
        evaluate_narration_independence(),
        evaluate_economic_independence(sats_earned, sats_spent),
        evaluate_operational_independence(uptime_hours, human_interventions=human_interventions),
    ]

    overall = all(res.passed for res in results)

    report = GraduationReport(
        all_passed=overall,
        conditions=results,
        metadata={
            "sats_earned": sats_earned,
            "sats_spent": sats_spent,
            "uptime_hours": uptime_hours,
            "human_interventions": human_interventions,
        },
    )

    if overall:
        logger.info("GRADUATION TEST PASSED — all 5 conditions met simultaneously")
    else:
        failing = [res.name for res in results if not res.passed]
        logger.info(
            "Graduation test: %d/5 passed. Failed: %s",
            len(results) - len(failing),
            ", ".join(failing),
        )

    return report
|
|
|
|
|
|
def persist_graduation_report(report: GraduationReport) -> Path:
    """Save a graduation report to ``data/graduation_reports/``.

    Returns the target path even when the write fails (a warning is
    logged instead of raising); callers that must know whether the file
    exists should check ``path.exists()``.
    """
    reports_dir = Path(settings.repo_root) / "data" / "graduation_reports"
    reports_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    path = reports_dir / f"graduation_{timestamp}.json"

    try:
        # Explicit UTF-8 so the report's encoding does not depend on the
        # platform locale; default=str keeps unexpected value types from
        # aborting the dump (they are stringified instead).
        with path.open("w", encoding="utf-8") as f:
            json.dump(report.to_dict(), f, indent=2, default=str)
        logger.info("Graduation report saved to %s", path)
    except Exception as exc:
        logger.warning("Failed to persist graduation report: %s", exc)

    return path
|