src/timmy/loop_qa.py

"""Loop QA — structured self-test framework for Timmy's capabilities.

Runs alongside (not inside) the thinking loop. Each cycle probes one
capability in round-robin, logs results via event_log, tracks failures
in memory, and files upgrade tasks via create_task() when degradation
is detected.

Reuses existing infrastructure:
- swarm.event_log.log_event / EventType  → result persistence
- swarm.task_queue.models.create_task    → upgrade proposals
- infrastructure.error_capture           → crash handling

Usage::

    from timmy.loop_qa import loop_qa_orchestrator

    await loop_qa_orchestrator.run_next_test()
    snapshot = loop_qa_orchestrator.get_health_snapshot()
"""

import asyncio
import logging
import uuid
from datetime import UTC, datetime
from enum import StrEnum

from config import settings

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Data models
# ---------------------------------------------------------------------------


class Capability(StrEnum):
    """Capabilities exercised by self-test probes."""

    TOOL_USE = "tool_use"
    MULTISTEP_PLANNING = "multistep_planning"
    MEMORY_READ = "memory_read"
    MEMORY_WRITE = "memory_write"
    SELF_CODING = "self_coding"
    LIGHTNING_ECON = "lightning_econ"


# ---------------------------------------------------------------------------
# Lazy accessors (avoid import-time side effects)
# ---------------------------------------------------------------------------


def _get_shell_hand():
    """Lazy-import the shell hand singleton."""
    from infrastructure.hands.shell import shell_hand

    return shell_hand


def _get_vault():
    """Lazy-import the vault memory singleton."""
    from timmy.memory_system import get_memory_system

    return get_memory_system().vault


def _get_brain_memory():
    """Return None — brain module removed.

    Memory operations now go through timmy.memory_system.
    """
    return None


# ---------------------------------------------------------------------------
# Six self-test probes — each returns a result dict
# ---------------------------------------------------------------------------


async def probe_tool_use() -> dict:
    """T1: call shell_hand.run('ls') and confirm non-empty result."""
    cap = Capability.TOOL_USE
    try:
        hand = _get_shell_hand()
        result = await hand.run("ls")
        if result.success and result.stdout.strip():
            return {
                "success": True,
                "capability": cap,
                "details": f"ls returned {len(result.stdout.splitlines())} lines",
                "error_type": None,
            }
        return {
            "success": False,
            "capability": cap,
            "details": f"ls returned empty or failed: {result.stderr[:100]}",
            "error_type": "empty_result",
        }
    except Exception as exc:
        logger.exception("Tool use probe failed")
        return {
            "success": False,
            "capability": cap,
            "details": str(exc)[:200],
            "error_type": type(exc).__name__,
        }


async def probe_multistep_planning() -> dict:
    """T2: write a temp vault note and verify it exists with content."""
    cap = Capability.MULTISTEP_PLANNING
    try:
        vault = _get_vault()
        marker = f"loop_qa_plan_test_{uuid.uuid4().hex[:8]}"
        content = (
            f"# Loop QA Planning Test\n\nMarker: {marker}\nDate: {datetime.now(UTC).isoformat()}"
        )
        path = await asyncio.to_thread(vault.write_note, "loop_qa_test", content, "notes")
        if path.exists() and marker in path.read_text():
            return {
                "success": True,
                "capability": cap,
                "details": f"Wrote and verified {path.name}",
                "error_type": None,
            }
        return {
            "success": False,
            "capability": cap,
            "details": "File missing or content mismatch",
            "error_type": "verification_failed",
        }
    except Exception as exc:
        logger.exception("Multistep planning probe failed")
        return {
            "success": False,
            "capability": cap,
            "details": str(exc)[:200],
            "error_type": type(exc).__name__,
        }


async def probe_memory_write() -> dict:
    """T3: call brain.store_fact_sync and verify no exception."""
    cap = Capability.MEMORY_WRITE
    try:
        mem = _get_brain_memory()
        marker = f"loop_qa_marker_{uuid.uuid4().hex[:8]}"
        await asyncio.to_thread(mem.store_fact_sync, "self_test_marker", marker)
        return {
            "success": True,
            "capability": cap,
            "details": f"Stored fact: {marker}",
            "error_type": None,
        }
    except Exception as exc:
        logger.exception("Memory write probe failed")
        return {
            "success": False,
            "capability": cap,
            "details": str(exc)[:200],
            "error_type": type(exc).__name__,
        }


async def probe_memory_read() -> dict:
    """T4: call brain.get_facts_sync and verify results returned."""
    cap = Capability.MEMORY_READ
    try:
        mem = _get_brain_memory()
        facts = await asyncio.to_thread(mem.get_facts_sync, "self_test_marker")
        if facts:
            return {
                "success": True,
                "capability": cap,
                "details": f"Retrieved {len(facts)} self_test_marker facts",
                "error_type": None,
            }
        return {
            "success": False,
            "capability": cap,
            "details": "No self_test_marker facts found",
            "error_type": "empty_result",
        }
    except Exception as exc:
        logger.exception("Memory read probe failed")
        return {
            "success": False,
            "capability": cap,
            "details": str(exc)[:200],
            "error_type": type(exc).__name__,
        }


async def probe_self_coding() -> dict:
    """T5: write a self-test note to memory/self/ via vault."""
    cap = Capability.SELF_CODING
    try:
        vault = _get_vault()
        content = (
            "# Self-Test Improvement Note\n\n"
            f"**Generated:** {datetime.now(UTC).isoformat()}\n\n"
            "## What\nLoop QA self-coding probe — validates vault write capability.\n\n"
            "## Why\nEnsure the self-coding pathway is functional.\n\n"
            "## How\nWrite this note and verify it exists."
        )
        path = await asyncio.to_thread(vault.write_note, "self_test_note", content, "self")
        if path.exists() and path.stat().st_size > 0:
            return {
                "success": True,
                "capability": cap,
                "details": f"Wrote {path.name} ({path.stat().st_size} bytes)",
                "error_type": None,
            }
        return {
            "success": False,
            "capability": cap,
            "details": "File missing or empty after write",
            "error_type": "verification_failed",
        }
    except Exception as exc:
        logger.exception("Self-coding probe failed")
        return {
            "success": False,
            "capability": cap,
            "details": str(exc)[:200],
            "error_type": type(exc).__name__,
        }


async def probe_lightning_econ() -> dict:
    """T6: placeholder — Lightning module pending v2."""
    return {
        "success": True,
        "capability": Capability.LIGHTNING_ECON,
        "details": "Lightning module pending v2 — placeholder pass",
        "error_type": None,
    }


# ---------------------------------------------------------------------------
# Test sequence (round-robin order)
# ---------------------------------------------------------------------------

TEST_SEQUENCE: list[tuple[Capability, str]] = [
    (Capability.TOOL_USE, "probe_tool_use"),
    (Capability.MULTISTEP_PLANNING, "probe_multistep_planning"),
    (Capability.MEMORY_WRITE, "probe_memory_write"),
    (Capability.MEMORY_READ, "probe_memory_read"),
    (Capability.SELF_CODING, "probe_self_coding"),
    (Capability.LIGHTNING_ECON, "probe_lightning_econ"),
]


# ---------------------------------------------------------------------------
# Orchestrator
# ---------------------------------------------------------------------------


def log_event(event_type, **kwargs):
    """No-op — swarm event_log removed."""
    logger.debug("log_event(%s) — swarm module removed, skipping", event_type)


def capture_error(exc, **kwargs):
    """Proxy to infrastructure.error_capture — lazy import."""
    try:
        from infrastructure.error_capture import capture_error as _capture

        return _capture(exc, **kwargs)
    except Exception as capture_exc:
        logger.debug("Failed to capture error: %s", capture_exc)
        logger.debug("Failed to capture error", exc_info=True)


def create_task(**kwargs):
    """No-op — swarm task queue removed."""
    logger.debug("create_task() — swarm module removed, skipping")
    return None


class LoopQAOrchestrator:
    """Round-robin self-test orchestrator.

    Runs one probe per invocation, cycling through T1–T6. Tracks
    consecutive failures in memory (circuit-breaker pattern) and
    files upgrade tasks via create_task() when degradation is detected.
    """

    def __init__(self) -> None:
        self._test_index: int = 0
        self._failure_counts: dict[Capability, int] = {c: 0 for c in Capability}
        self._last_failed: dict[Capability, str | None] = {c: None for c in Capability}
        self._proposal_filed: set[Capability] = set()
        self._hourly_count: int = 0
        self._hour_marker: int = -1

    async def run_next_test(self) -> dict | None:
        """Run the next probe in the round-robin sequence.

        Returns result dict, or None if disabled/throttled.
        """
        if not settings.loop_qa_enabled:
            return None

        # Hourly throttle
        now = datetime.now(UTC)
        current_hour = now.hour
        if current_hour != self._hour_marker:
            self._hourly_count = 0
            self._hour_marker = current_hour

        if self._hourly_count >= settings.loop_qa_max_per_hour:
            logger.debug(
                "Loop QA throttled: %d/%d this hour",
                self._hourly_count,
                settings.loop_qa_max_per_hour,
            )
            return None

        # Pick next probe (resolve name at call time for testability)
        import timmy.loop_qa as _self_module

        cap, probe_name = TEST_SEQUENCE[self._test_index]
        probe_fn = getattr(_self_module, probe_name)
        self._test_index = (self._test_index + 1) % len(TEST_SEQUENCE)
        self._hourly_count += 1

        # Run probe
        try:
            result = await probe_fn()
        except Exception as exc:
            # Probe itself crashed — record failure and report
            logger.exception("Loop QA probe %s crashed", cap.value)
            capture_error(exc, source="loop_qa", context={"capability": cap.value})
            result = {
                "success": False,
                "capability": cap,
                "details": f"Probe crashed: {exc!s}"[:200],
                "error_type": type(exc).__name__,
            }

        # Log result
        event_type = "loop_qa_ok" if result["success"] else "loop_qa_fail"
        log_event(
            event_type,
            source="loop_qa",
            data={
                "capability": cap.value,
                "details": result.get("details", ""),
                "error_type": result.get("error_type"),
            },
        )

        # Update failure counter
        if result["success"]:
            self._failure_counts[cap] = 0
            self._last_failed[cap] = None
            self._proposal_filed.discard(cap)
        else:
            self._failure_counts[cap] += 1
            self._last_failed[cap] = now.isoformat()
            self._maybe_file_upgrade(cap)

        return result

    def _maybe_file_upgrade(self, cap: Capability) -> None:
        """File an upgrade task if threshold is reached and not already filed."""
        count = self._failure_counts[cap]
        if count < settings.loop_qa_upgrade_threshold:
            return
        if cap in self._proposal_filed:
            return

        try:
            title = f"Stabilize {cap.value.upper()}: self-test failing {count}x in a row"
            description = (
                f"Loop QA detected {count} consecutive failures "
                f"for capability '{cap.value}'.\n\n"
                f"Last failure: {self._last_failed[cap]}\n"
                f"Action: investigate root cause and restore capability."
            )
            create_task(
                title=title,
                description=description,
                priority="high",
                created_by="timmy_loop_qa",
                task_type="loop_qa_upgrade",
            )
            self._proposal_filed.add(cap)
            logger.info("Filed upgrade proposal for %s: %s", cap.value, title)
        except Exception as exc:
            logger.warning("Failed to file upgrade proposal: %s", exc)

    def get_health_snapshot(self) -> dict:
        """Build a health snapshot from in-memory failure counters."""
        capabilities = []
        for cap in Capability:
            count = self._failure_counts.get(cap, 0)
            capabilities.append(
                {
                    "capability": cap,
                    "status": self.status_for_failures(count),
                    "last_failed_at": self._last_failed.get(cap),
                    "consecutive_failures": count,
                }
            )

        statuses = [c["status"] for c in capabilities]
        if "red" in statuses:
            overall = "red"
        elif "yellow" in statuses:
            overall = "yellow"
        else:
            overall = "green"

        return {
            "generated_at": datetime.now(UTC).isoformat(),
            "overall_status": overall,
            "capabilities": capabilities,
        }

    @staticmethod
    def status_for_failures(count: int) -> str:
        """Map consecutive failure count to green/yellow/red."""
        if count >= settings.loop_qa_upgrade_threshold:
            return "red"
        elif count >= 2:
            return "yellow"
        return "green"


# ── Module singleton ─────────────────────────────────────────────────────────

loop_qa_orchestrator = LoopQAOrchestrator()