feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
"""Loop QA — structured self-test framework for Timmy's capabilities.
|
|
|
|
|
|
|
|
|
|
|
|
Runs alongside (not inside) the thinking loop. Each cycle probes one
|
|
|
|
|
|
capability in round-robin, logs results via event_log, tracks failures
|
|
|
|
|
|
in memory, and files upgrade tasks via create_task() when degradation
|
|
|
|
|
|
is detected.
|
|
|
|
|
|
|
|
|
|
|
|
Reuses existing infrastructure:
|
|
|
|
|
|
- swarm.event_log.log_event / EventType → result persistence
|
|
|
|
|
|
- swarm.task_queue.models.create_task → upgrade proposals
|
|
|
|
|
|
- infrastructure.error_capture → crash handling
|
|
|
|
|
|
|
|
|
|
|
|
Usage::
|
|
|
|
|
|
|
|
|
|
|
|
from timmy.loop_qa import loop_qa_orchestrator
|
|
|
|
|
|
|
|
|
|
|
|
await loop_qa_orchestrator.run_next_test()
|
|
|
|
|
|
snapshot = loop_qa_orchestrator.get_health_snapshot()
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import asyncio
|
|
|
|
|
|
import logging
|
|
|
|
|
|
import uuid
|
|
|
|
|
|
from datetime import UTC, datetime
|
|
|
|
|
|
from enum import StrEnum
|
|
|
|
|
|
|
|
|
|
|
|
from config import settings
|
|
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Data models
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Capability(StrEnum):
|
|
|
|
|
|
"""Capabilities exercised by self-test probes."""
|
|
|
|
|
|
|
|
|
|
|
|
TOOL_USE = "tool_use"
|
|
|
|
|
|
MULTISTEP_PLANNING = "multistep_planning"
|
|
|
|
|
|
MEMORY_READ = "memory_read"
|
|
|
|
|
|
MEMORY_WRITE = "memory_write"
|
|
|
|
|
|
SELF_CODING = "self_coding"
|
|
|
|
|
|
LIGHTNING_ECON = "lightning_econ"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Lazy accessors (avoid import-time side effects)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_shell_hand():
|
|
|
|
|
|
"""Lazy-import the shell hand singleton."""
|
|
|
|
|
|
from infrastructure.hands.shell import shell_hand
|
|
|
|
|
|
|
|
|
|
|
|
return shell_hand
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_vault():
|
|
|
|
|
|
"""Lazy-import the vault memory singleton."""
|
|
|
|
|
|
from timmy.memory_system import get_memory_system
|
|
|
|
|
|
|
|
|
|
|
|
return get_memory_system().vault
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_brain_memory():
|
cleanup: delete dead modules — ~7,900 lines removed
Closes #22, Closes #23
Deleted: brain/, swarm/, openfang/, paperclip/, cascade_adapter,
memory_migrate, agents/timmy.py, dead routes + all corresponding tests.
Updated pyproject.toml, app.py, loop_qa.py for removed imports.
2026-03-14 09:49:24 -04:00
|
|
|
|
"""Return None — brain module removed.
|
2026-03-12 11:23:18 -04:00
|
|
|
|
|
cleanup: delete dead modules — ~7,900 lines removed
Closes #22, Closes #23
Deleted: brain/, swarm/, openfang/, paperclip/, cascade_adapter,
memory_migrate, agents/timmy.py, dead routes + all corresponding tests.
Updated pyproject.toml, app.py, loop_qa.py for removed imports.
2026-03-14 09:49:24 -04:00
|
|
|
|
Memory operations now go through timmy.memory_system.
|
2026-03-12 11:23:18 -04:00
|
|
|
|
"""
|
cleanup: delete dead modules — ~7,900 lines removed
Closes #22, Closes #23
Deleted: brain/, swarm/, openfang/, paperclip/, cascade_adapter,
memory_migrate, agents/timmy.py, dead routes + all corresponding tests.
Updated pyproject.toml, app.py, loop_qa.py for removed imports.
2026-03-14 09:49:24 -04:00
|
|
|
|
return None
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Six self-test probes — each returns a result dict
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def probe_tool_use() -> dict:
|
|
|
|
|
|
"""T1: call shell_hand.run('ls') and confirm non-empty result."""
|
|
|
|
|
|
cap = Capability.TOOL_USE
|
|
|
|
|
|
try:
|
|
|
|
|
|
hand = _get_shell_hand()
|
|
|
|
|
|
result = await hand.run("ls")
|
|
|
|
|
|
if result.success and result.stdout.strip():
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": True,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": f"ls returned {len(result.stdout.splitlines())} lines",
|
|
|
|
|
|
"error_type": None,
|
|
|
|
|
|
}
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": f"ls returned empty or failed: {result.stderr[:100]}",
|
|
|
|
|
|
"error_type": "empty_result",
|
|
|
|
|
|
}
|
|
|
|
|
|
except Exception as exc:
|
2026-03-21 03:50:26 +00:00
|
|
|
|
logger.exception("Tool use probe failed")
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
return {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": str(exc)[:200],
|
|
|
|
|
|
"error_type": type(exc).__name__,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def probe_multistep_planning() -> dict:
|
|
|
|
|
|
"""T2: write a temp vault note and verify it exists with content."""
|
|
|
|
|
|
cap = Capability.MULTISTEP_PLANNING
|
|
|
|
|
|
try:
|
|
|
|
|
|
vault = _get_vault()
|
|
|
|
|
|
marker = f"loop_qa_plan_test_{uuid.uuid4().hex[:8]}"
|
|
|
|
|
|
content = (
|
|
|
|
|
|
f"# Loop QA Planning Test\n\nMarker: {marker}\nDate: {datetime.now(UTC).isoformat()}"
|
|
|
|
|
|
)
|
|
|
|
|
|
path = await asyncio.to_thread(vault.write_note, "loop_qa_test", content, "notes")
|
|
|
|
|
|
if path.exists() and marker in path.read_text():
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": True,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": f"Wrote and verified {path.name}",
|
|
|
|
|
|
"error_type": None,
|
|
|
|
|
|
}
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": "File missing or content mismatch",
|
|
|
|
|
|
"error_type": "verification_failed",
|
|
|
|
|
|
}
|
|
|
|
|
|
except Exception as exc:
|
2026-03-21 03:50:26 +00:00
|
|
|
|
logger.exception("Multistep planning probe failed")
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
return {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": str(exc)[:200],
|
|
|
|
|
|
"error_type": type(exc).__name__,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def probe_memory_write() -> dict:
|
|
|
|
|
|
"""T3: call brain.store_fact_sync and verify no exception."""
|
|
|
|
|
|
cap = Capability.MEMORY_WRITE
|
|
|
|
|
|
try:
|
|
|
|
|
|
mem = _get_brain_memory()
|
|
|
|
|
|
marker = f"loop_qa_marker_{uuid.uuid4().hex[:8]}"
|
|
|
|
|
|
await asyncio.to_thread(mem.store_fact_sync, "self_test_marker", marker)
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": True,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": f"Stored fact: {marker}",
|
|
|
|
|
|
"error_type": None,
|
|
|
|
|
|
}
|
|
|
|
|
|
except Exception as exc:
|
2026-03-21 03:50:26 +00:00
|
|
|
|
logger.exception("Memory write probe failed")
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
return {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": str(exc)[:200],
|
|
|
|
|
|
"error_type": type(exc).__name__,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def probe_memory_read() -> dict:
|
|
|
|
|
|
"""T4: call brain.get_facts_sync and verify results returned."""
|
|
|
|
|
|
cap = Capability.MEMORY_READ
|
|
|
|
|
|
try:
|
|
|
|
|
|
mem = _get_brain_memory()
|
|
|
|
|
|
facts = await asyncio.to_thread(mem.get_facts_sync, "self_test_marker")
|
|
|
|
|
|
if facts:
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": True,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": f"Retrieved {len(facts)} self_test_marker facts",
|
|
|
|
|
|
"error_type": None,
|
|
|
|
|
|
}
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": "No self_test_marker facts found",
|
|
|
|
|
|
"error_type": "empty_result",
|
|
|
|
|
|
}
|
|
|
|
|
|
except Exception as exc:
|
2026-03-21 03:50:26 +00:00
|
|
|
|
logger.exception("Memory read probe failed")
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
return {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": str(exc)[:200],
|
|
|
|
|
|
"error_type": type(exc).__name__,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def probe_self_coding() -> dict:
|
|
|
|
|
|
"""T5: write a self-test note to memory/self/ via vault."""
|
|
|
|
|
|
cap = Capability.SELF_CODING
|
|
|
|
|
|
try:
|
|
|
|
|
|
vault = _get_vault()
|
|
|
|
|
|
content = (
|
|
|
|
|
|
"# Self-Test Improvement Note\n\n"
|
|
|
|
|
|
f"**Generated:** {datetime.now(UTC).isoformat()}\n\n"
|
|
|
|
|
|
"## What\nLoop QA self-coding probe — validates vault write capability.\n\n"
|
|
|
|
|
|
"## Why\nEnsure the self-coding pathway is functional.\n\n"
|
|
|
|
|
|
"## How\nWrite this note and verify it exists."
|
|
|
|
|
|
)
|
|
|
|
|
|
path = await asyncio.to_thread(vault.write_note, "self_test_note", content, "self")
|
|
|
|
|
|
if path.exists() and path.stat().st_size > 0:
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": True,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": f"Wrote {path.name} ({path.stat().st_size} bytes)",
|
|
|
|
|
|
"error_type": None,
|
|
|
|
|
|
}
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": "File missing or empty after write",
|
|
|
|
|
|
"error_type": "verification_failed",
|
|
|
|
|
|
}
|
|
|
|
|
|
except Exception as exc:
|
2026-03-21 03:50:26 +00:00
|
|
|
|
logger.exception("Self-coding probe failed")
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
return {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": str(exc)[:200],
|
|
|
|
|
|
"error_type": type(exc).__name__,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def probe_lightning_econ() -> dict:
|
|
|
|
|
|
"""T6: placeholder — Lightning module pending v2."""
|
|
|
|
|
|
return {
|
|
|
|
|
|
"success": True,
|
|
|
|
|
|
"capability": Capability.LIGHTNING_ECON,
|
|
|
|
|
|
"details": "Lightning module pending v2 — placeholder pass",
|
|
|
|
|
|
"error_type": None,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Test sequence (round-robin order)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
TEST_SEQUENCE: list[tuple[Capability, str]] = [
|
|
|
|
|
|
(Capability.TOOL_USE, "probe_tool_use"),
|
|
|
|
|
|
(Capability.MULTISTEP_PLANNING, "probe_multistep_planning"),
|
|
|
|
|
|
(Capability.MEMORY_WRITE, "probe_memory_write"),
|
|
|
|
|
|
(Capability.MEMORY_READ, "probe_memory_read"),
|
|
|
|
|
|
(Capability.SELF_CODING, "probe_self_coding"),
|
|
|
|
|
|
(Capability.LIGHTNING_ECON, "probe_lightning_econ"),
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Orchestrator
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def log_event(event_type, **kwargs):
|
cleanup: delete dead modules — ~7,900 lines removed
Closes #22, Closes #23
Deleted: brain/, swarm/, openfang/, paperclip/, cascade_adapter,
memory_migrate, agents/timmy.py, dead routes + all corresponding tests.
Updated pyproject.toml, app.py, loop_qa.py for removed imports.
2026-03-14 09:49:24 -04:00
|
|
|
|
"""No-op — swarm event_log removed."""
|
|
|
|
|
|
logger.debug("log_event(%s) — swarm module removed, skipping", event_type)
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def capture_error(exc, **kwargs):
|
|
|
|
|
|
"""Proxy to infrastructure.error_capture — lazy import."""
|
|
|
|
|
|
try:
|
|
|
|
|
|
from infrastructure.error_capture import capture_error as _capture
|
|
|
|
|
|
|
|
|
|
|
|
return _capture(exc, **kwargs)
|
2026-03-14 19:07:14 -04:00
|
|
|
|
except Exception as capture_exc:
|
|
|
|
|
|
logger.debug("Failed to capture error: %s", capture_exc)
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
logger.debug("Failed to capture error", exc_info=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_task(**kwargs):
|
cleanup: delete dead modules — ~7,900 lines removed
Closes #22, Closes #23
Deleted: brain/, swarm/, openfang/, paperclip/, cascade_adapter,
memory_migrate, agents/timmy.py, dead routes + all corresponding tests.
Updated pyproject.toml, app.py, loop_qa.py for removed imports.
2026-03-14 09:49:24 -04:00
|
|
|
|
"""No-op — swarm task queue removed."""
|
|
|
|
|
|
logger.debug("create_task() — swarm module removed, skipping")
|
|
|
|
|
|
return None
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LoopQAOrchestrator:
|
|
|
|
|
|
"""Round-robin self-test orchestrator.
|
|
|
|
|
|
|
|
|
|
|
|
Runs one probe per invocation, cycling through T1–T6. Tracks
|
|
|
|
|
|
consecutive failures in memory (circuit-breaker pattern) and
|
|
|
|
|
|
files upgrade tasks via create_task() when degradation is detected.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self) -> None:
|
|
|
|
|
|
self._test_index: int = 0
|
|
|
|
|
|
self._failure_counts: dict[Capability, int] = {c: 0 for c in Capability}
|
|
|
|
|
|
self._last_failed: dict[Capability, str | None] = {c: None for c in Capability}
|
|
|
|
|
|
self._proposal_filed: set[Capability] = set()
|
|
|
|
|
|
self._hourly_count: int = 0
|
|
|
|
|
|
self._hour_marker: int = -1
|
|
|
|
|
|
|
|
|
|
|
|
async def run_next_test(self) -> dict | None:
|
|
|
|
|
|
"""Run the next probe in the round-robin sequence.
|
|
|
|
|
|
|
|
|
|
|
|
Returns result dict, or None if disabled/throttled.
|
|
|
|
|
|
"""
|
|
|
|
|
|
if not settings.loop_qa_enabled:
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
# Hourly throttle
|
|
|
|
|
|
now = datetime.now(UTC)
|
|
|
|
|
|
current_hour = now.hour
|
|
|
|
|
|
if current_hour != self._hour_marker:
|
|
|
|
|
|
self._hourly_count = 0
|
|
|
|
|
|
self._hour_marker = current_hour
|
|
|
|
|
|
|
|
|
|
|
|
if self._hourly_count >= settings.loop_qa_max_per_hour:
|
|
|
|
|
|
logger.debug(
|
|
|
|
|
|
"Loop QA throttled: %d/%d this hour",
|
|
|
|
|
|
self._hourly_count,
|
|
|
|
|
|
settings.loop_qa_max_per_hour,
|
|
|
|
|
|
)
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
# Pick next probe (resolve name at call time for testability)
|
|
|
|
|
|
import timmy.loop_qa as _self_module
|
|
|
|
|
|
|
|
|
|
|
|
cap, probe_name = TEST_SEQUENCE[self._test_index]
|
|
|
|
|
|
probe_fn = getattr(_self_module, probe_name)
|
|
|
|
|
|
self._test_index = (self._test_index + 1) % len(TEST_SEQUENCE)
|
|
|
|
|
|
self._hourly_count += 1
|
|
|
|
|
|
|
|
|
|
|
|
# Run probe
|
|
|
|
|
|
try:
|
|
|
|
|
|
result = await probe_fn()
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
# Probe itself crashed — record failure and report
|
2026-03-21 03:50:26 +00:00
|
|
|
|
logger.exception("Loop QA probe %s crashed", cap.value)
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
capture_error(exc, source="loop_qa", context={"capability": cap.value})
|
|
|
|
|
|
result = {
|
|
|
|
|
|
"success": False,
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"details": f"Probe crashed: {exc!s}"[:200],
|
|
|
|
|
|
"error_type": type(exc).__name__,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
cleanup: delete dead modules — ~7,900 lines removed
Closes #22, Closes #23
Deleted: brain/, swarm/, openfang/, paperclip/, cascade_adapter,
memory_migrate, agents/timmy.py, dead routes + all corresponding tests.
Updated pyproject.toml, app.py, loop_qa.py for removed imports.
2026-03-14 09:49:24 -04:00
|
|
|
|
# Log result
|
|
|
|
|
|
event_type = "loop_qa_ok" if result["success"] else "loop_qa_fail"
|
feat: add Loop QA self-testing framework
Structured self-test framework that probes 6 capabilities (tool use,
multistep planning, memory read/write, self-coding, lightning econ) in
round-robin. Reuses existing infra: event_log for persistence,
create_task() for upgrade proposals, capture_error() for crash handling,
and in-memory circuit breaker for failure tracking.
- src/timmy/loop_qa.py: Capability enum, 6 async probes, orchestrator
- src/dashboard/routes/loop_qa.py: JSON + HTMX health endpoints
- HTMX partial polls every 30s on the health panel
- Background scheduler in app.py lifespan
- 25 tests covering probes, orchestrator, health snapshot, routes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 22:33:16 -04:00
|
|
|
|
log_event(
|
|
|
|
|
|
event_type,
|
|
|
|
|
|
source="loop_qa",
|
|
|
|
|
|
data={
|
|
|
|
|
|
"capability": cap.value,
|
|
|
|
|
|
"details": result.get("details", ""),
|
|
|
|
|
|
"error_type": result.get("error_type"),
|
|
|
|
|
|
},
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# Update failure counter
|
|
|
|
|
|
if result["success"]:
|
|
|
|
|
|
self._failure_counts[cap] = 0
|
|
|
|
|
|
self._last_failed[cap] = None
|
|
|
|
|
|
self._proposal_filed.discard(cap)
|
|
|
|
|
|
else:
|
|
|
|
|
|
self._failure_counts[cap] += 1
|
|
|
|
|
|
self._last_failed[cap] = now.isoformat()
|
|
|
|
|
|
self._maybe_file_upgrade(cap)
|
|
|
|
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def _maybe_file_upgrade(self, cap: Capability) -> None:
|
|
|
|
|
|
"""File an upgrade task if threshold is reached and not already filed."""
|
|
|
|
|
|
count = self._failure_counts[cap]
|
|
|
|
|
|
if count < settings.loop_qa_upgrade_threshold:
|
|
|
|
|
|
return
|
|
|
|
|
|
if cap in self._proposal_filed:
|
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
title = f"Stabilize {cap.value.upper()}: self-test failing {count}x in a row"
|
|
|
|
|
|
description = (
|
|
|
|
|
|
f"Loop QA detected {count} consecutive failures "
|
|
|
|
|
|
f"for capability '{cap.value}'.\n\n"
|
|
|
|
|
|
f"Last failure: {self._last_failed[cap]}\n"
|
|
|
|
|
|
f"Action: investigate root cause and restore capability."
|
|
|
|
|
|
)
|
|
|
|
|
|
create_task(
|
|
|
|
|
|
title=title,
|
|
|
|
|
|
description=description,
|
|
|
|
|
|
priority="high",
|
|
|
|
|
|
created_by="timmy_loop_qa",
|
|
|
|
|
|
task_type="loop_qa_upgrade",
|
|
|
|
|
|
)
|
|
|
|
|
|
self._proposal_filed.add(cap)
|
|
|
|
|
|
logger.info("Filed upgrade proposal for %s: %s", cap.value, title)
|
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
|
logger.warning("Failed to file upgrade proposal: %s", exc)
|
|
|
|
|
|
|
|
|
|
|
|
def get_health_snapshot(self) -> dict:
|
|
|
|
|
|
"""Build a health snapshot from in-memory failure counters."""
|
|
|
|
|
|
capabilities = []
|
|
|
|
|
|
for cap in Capability:
|
|
|
|
|
|
count = self._failure_counts.get(cap, 0)
|
|
|
|
|
|
capabilities.append(
|
|
|
|
|
|
{
|
|
|
|
|
|
"capability": cap,
|
|
|
|
|
|
"status": self.status_for_failures(count),
|
|
|
|
|
|
"last_failed_at": self._last_failed.get(cap),
|
|
|
|
|
|
"consecutive_failures": count,
|
|
|
|
|
|
}
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
statuses = [c["status"] for c in capabilities]
|
|
|
|
|
|
if "red" in statuses:
|
|
|
|
|
|
overall = "red"
|
|
|
|
|
|
elif "yellow" in statuses:
|
|
|
|
|
|
overall = "yellow"
|
|
|
|
|
|
else:
|
|
|
|
|
|
overall = "green"
|
|
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
|
"generated_at": datetime.now(UTC).isoformat(),
|
|
|
|
|
|
"overall_status": overall,
|
|
|
|
|
|
"capabilities": capabilities,
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
|
def status_for_failures(count: int) -> str:
|
|
|
|
|
|
"""Map consecutive failure count to green/yellow/red."""
|
|
|
|
|
|
if count >= settings.loop_qa_upgrade_threshold:
|
|
|
|
|
|
return "red"
|
|
|
|
|
|
elif count >= 2:
|
|
|
|
|
|
return "yellow"
|
|
|
|
|
|
return "green"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ── Module singleton ─────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
loop_qa_orchestrator = LoopQAOrchestrator()
|