feat: agent self-correction dashboard (#1007)
Implements the Agent Self-Correction Dashboard from issue #1007: - infrastructure/self_correction.py: SQLite-backed event logger. Records original_intent, detected_error, correction_strategy, and final_outcome with outcome_status (success/partial/failed) and error_type for pattern analysis. - timmy/agentic_loop.py: Hooks _handle_step_failure to emit a self-correction event whenever the loop adapts or fails to adapt a step. Uses best-effort logging (never raises). - dashboard/routes/self_correction.py: Three endpoints: GET /self-correction/ui - full dashboard GET /self-correction/timeline - HTMX partial (refreshes 30 s) GET /self-correction/patterns - HTMX partial (refreshes 60 s) - templates/self_correction.html + two partials: timeline and recurring-pattern table. Stats panel shows correction rate as a progress bar. - base.html: "SELF-CORRECT" link added to INTEL dropdown (desktop and mobile nav). - mission-control.css: new .sc-* component styles (no inline CSS). - tests/unit/test_self_correction.py: 18 unit tests covering log, get_corrections, get_patterns, get_stats. All 435 existing tests continue to pass. Fixes #1007 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -54,6 +54,7 @@ from dashboard.routes.system import router as system_router
|
||||
from dashboard.routes.tasks import router as tasks_router
|
||||
from dashboard.routes.telegram import router as telegram_router
|
||||
from dashboard.routes.thinking import router as thinking_router
|
||||
from dashboard.routes.self_correction import router as self_correction_router
|
||||
from dashboard.routes.three_strike import router as three_strike_router
|
||||
from dashboard.routes.tools import router as tools_router
|
||||
from dashboard.routes.tower import router as tower_router
|
||||
@@ -678,6 +679,7 @@ app.include_router(scorecards_router)
|
||||
app.include_router(sovereignty_metrics_router)
|
||||
app.include_router(sovereignty_ws_router)
|
||||
app.include_router(three_strike_router)
|
||||
app.include_router(self_correction_router)
|
||||
|
||||
|
||||
@app.websocket("/ws")
|
||||
|
||||
58
src/dashboard/routes/self_correction.py
Normal file
58
src/dashboard/routes/self_correction.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Self-Correction Dashboard routes.
|
||||
|
||||
GET /self-correction/ui — HTML dashboard
|
||||
GET /self-correction/timeline — HTMX partial: recent event timeline
|
||||
GET /self-correction/patterns — HTMX partial: recurring failure patterns
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from fastapi import APIRouter, Request
|
||||
from fastapi.responses import HTMLResponse
|
||||
|
||||
from dashboard.templating import templates
|
||||
from infrastructure.self_correction import get_corrections, get_patterns, get_stats
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/self-correction", tags=["self-correction"])
|
||||
|
||||
|
||||
@router.get("/ui", response_class=HTMLResponse)
async def self_correction_ui(request: Request):
    """Serve the full Self-Correction Dashboard page.

    Collects the aggregate stats, the 20 most recent correction events and
    the top 10 recurring error patterns, and renders them into
    ``self_correction.html``.
    """
    # Dict entries are evaluated top to bottom: stats, then events, then patterns.
    context = {
        "stats": get_stats(),
        "corrections": get_corrections(limit=20),
        "patterns": get_patterns(top_n=10),
    }
    return templates.TemplateResponse(request, "self_correction.html", context)
|
||||
|
||||
|
||||
@router.get("/timeline", response_class=HTMLResponse)
async def self_correction_timeline(request: Request):
    """Serve the timeline partial holding the 30 most recent events.

    Renders ``partials/self_correction_timeline.html`` with a single
    ``corrections`` context entry.
    """
    recent_events = get_corrections(limit=30)
    context = {"corrections": recent_events}
    return templates.TemplateResponse(
        request,
        "partials/self_correction_timeline.html",
        context,
    )
|
||||
|
||||
|
||||
@router.get("/patterns", response_class=HTMLResponse)
async def self_correction_patterns(request: Request):
    """Serve the recurring-patterns partial.

    Renders ``partials/self_correction_patterns.html`` with the top 10
    error patterns and the current aggregate stats.
    """
    # Patterns are queried before stats, matching the dict's evaluation order.
    context = {
        "patterns": get_patterns(top_n=10),
        "stats": get_stats(),
    }
    return templates.TemplateResponse(
        request,
        "partials/self_correction_patterns.html",
        context,
    )
|
||||
@@ -71,6 +71,7 @@
|
||||
<a href="/spark/ui" class="mc-test-link">SPARK</a>
|
||||
<a href="/memory" class="mc-test-link">MEMORY</a>
|
||||
<a href="/marketplace/ui" class="mc-test-link">MARKET</a>
|
||||
<a href="/self-correction/ui" class="mc-test-link">SELF-CORRECT</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="mc-nav-dropdown">
|
||||
@@ -132,6 +133,7 @@
|
||||
<a href="/spark/ui" class="mc-mobile-link">SPARK</a>
|
||||
<a href="/memory" class="mc-mobile-link">MEMORY</a>
|
||||
<a href="/marketplace/ui" class="mc-mobile-link">MARKET</a>
|
||||
<a href="/self-correction/ui" class="mc-mobile-link">SELF-CORRECT</a>
|
||||
<div class="mc-mobile-section-label">AGENTS</div>
|
||||
<a href="/hands" class="mc-mobile-link">HANDS</a>
|
||||
<a href="/work-orders/queue" class="mc-mobile-link">WORK ORDERS</a>
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
{% if patterns %}
|
||||
<table class="mc-table w-100">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ERROR TYPE</th>
|
||||
<th class="text-center">COUNT</th>
|
||||
<th class="text-center">CORRECTED</th>
|
||||
<th class="text-center">FAILED</th>
|
||||
<th>LAST SEEN</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for p in patterns %}
|
||||
<tr>
|
||||
<td class="sc-pattern-type">{{ p.error_type }}</td>
|
||||
<td class="text-center">
|
||||
<span class="badge {% if p.count >= 5 %}badge-error{% elif p.count >= 3 %}badge-warning{% else %}badge-info{% endif %}">{{ p.count }}</span>
|
||||
</td>
|
||||
<td class="text-center text-success">{{ p.success_count }}</td>
|
||||
<td class="text-center {% if p.failed_count > 0 %}text-danger{% else %}text-muted{% endif %}">{{ p.failed_count }}</td>
|
||||
<td class="sc-event-time">{{ p.last_seen[:16] if p.last_seen else '—' }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% else %}
|
||||
<div class="text-center text-muted py-3">No patterns detected yet.</div>
|
||||
{% endif %}
|
||||
@@ -0,0 +1,26 @@
|
||||
{% if corrections %}
|
||||
{% for ev in corrections %}
|
||||
<div class="sc-event sc-status-{{ ev.outcome_status }}">
|
||||
<div class="sc-event-header">
|
||||
<span class="sc-status-badge sc-status-{{ ev.outcome_status }}">
|
||||
{% if ev.outcome_status == 'success' %}✓ CORRECTED
|
||||
{% elif ev.outcome_status == 'partial' %}● PARTIAL
|
||||
{% else %}✗ FAILED
|
||||
{% endif %}
|
||||
</span>
|
||||
<span class="sc-source-badge">{{ ev.source }}</span>
|
||||
<span class="sc-event-time">{{ ev.created_at[:19] }}</span>
|
||||
</div>
|
||||
<div class="sc-event-error-type">{{ ev.error_type }}</div>
|
||||
<div class="sc-event-intent"><span class="sc-label">INTENT:</span> {{ ev.original_intent[:120] }}{% if ev.original_intent | length > 120 %}…{% endif %}</div>
|
||||
<div class="sc-event-error"><span class="sc-label">ERROR:</span> {{ ev.detected_error[:120] }}{% if ev.detected_error | length > 120 %}…{% endif %}</div>
|
||||
<div class="sc-event-strategy"><span class="sc-label">STRATEGY:</span> {{ ev.correction_strategy[:120] }}{% if ev.correction_strategy | length > 120 %}…{% endif %}</div>
|
||||
<div class="sc-event-outcome"><span class="sc-label">OUTCOME:</span> {{ ev.final_outcome[:120] }}{% if ev.final_outcome | length > 120 %}…{% endif %}</div>
|
||||
{% if ev.task_id %}
|
||||
<div class="sc-event-meta">task: {{ ev.task_id[:8] }}</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="text-center text-muted py-3">No self-correction events recorded yet.</div>
|
||||
{% endif %}
|
||||
102
src/dashboard/templates/self_correction.html
Normal file
102
src/dashboard/templates/self_correction.html
Normal file
@@ -0,0 +1,102 @@
|
||||
{% extends "base.html" %}
|
||||
{% from "macros.html" import panel %}
|
||||
|
||||
{% block title %}Timmy Time — Self-Correction Dashboard{% endblock %}
|
||||
|
||||
{% block extra_styles %}{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container-fluid py-3">
|
||||
|
||||
<!-- Header -->
|
||||
<div class="spark-header mb-3">
|
||||
<div class="spark-title">SELF-CORRECTION</div>
|
||||
<div class="spark-subtitle">
|
||||
Agent error detection & recovery —
|
||||
<span class="spark-status-val">{{ stats.total }}</span> events,
|
||||
<span class="spark-status-val">{{ stats.success_rate }}%</span> correction rate,
|
||||
<span class="spark-status-val">{{ stats.unique_error_types }}</span> distinct error types
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row g-3">
|
||||
|
||||
<!-- Left column: stats + patterns -->
|
||||
<div class="col-12 col-lg-4 d-flex flex-column gap-3">
|
||||
|
||||
<!-- Stats panel -->
|
||||
<div class="card mc-panel">
|
||||
<div class="card-header mc-panel-header">// CORRECTION STATS</div>
|
||||
<div class="card-body p-3">
|
||||
<div class="spark-stat-grid">
|
||||
<div class="spark-stat">
|
||||
<span class="spark-stat-label">TOTAL</span>
|
||||
<span class="spark-stat-value">{{ stats.total }}</span>
|
||||
</div>
|
||||
<div class="spark-stat">
|
||||
<span class="spark-stat-label">CORRECTED</span>
|
||||
<span class="spark-stat-value text-success">{{ stats.success_count }}</span>
|
||||
</div>
|
||||
<div class="spark-stat">
|
||||
<span class="spark-stat-label">PARTIAL</span>
|
||||
<span class="spark-stat-value text-warning">{{ stats.partial_count }}</span>
|
||||
</div>
|
||||
<div class="spark-stat">
|
||||
<span class="spark-stat-label">FAILED</span>
|
||||
<span class="spark-stat-value {% if stats.failed_count > 0 %}text-danger{% else %}text-muted{% endif %}">{{ stats.failed_count }}</span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="mt-3">
|
||||
<div class="d-flex justify-content-between mb-1">
|
||||
<small class="text-muted">Correction Rate</small>
|
||||
<small class="{% if stats.success_rate >= 70 %}text-success{% elif stats.success_rate >= 40 %}text-warning{% else %}text-danger{% endif %}">{{ stats.success_rate }}%</small>
|
||||
</div>
|
||||
<div class="progress" style="height:6px;">
|
||||
<div class="progress-bar {% if stats.success_rate >= 70 %}bg-success{% elif stats.success_rate >= 40 %}bg-warning{% else %}bg-danger{% endif %}"
|
||||
role="progressbar"
|
||||
style="width:{{ stats.success_rate }}%"
|
||||
aria-valuenow="{{ stats.success_rate }}"
|
||||
aria-valuemin="0"
|
||||
aria-valuemax="100"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Patterns panel -->
|
||||
<div class="card mc-panel"
|
||||
hx-get="/self-correction/patterns"
|
||||
hx-trigger="load, every 60s"
|
||||
hx-target="#sc-patterns-body"
|
||||
hx-swap="innerHTML">
|
||||
<div class="card-header mc-panel-header d-flex justify-content-between align-items-center">
|
||||
<span>// RECURRING PATTERNS</span>
|
||||
<span class="badge badge-info">{{ patterns | length }}</span>
|
||||
</div>
|
||||
<div class="card-body p-0" id="sc-patterns-body">
|
||||
{% include "partials/self_correction_patterns.html" %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<!-- Right column: timeline -->
|
||||
<div class="col-12 col-lg-8">
|
||||
<div class="card mc-panel"
|
||||
hx-get="/self-correction/timeline"
|
||||
hx-trigger="load, every 30s"
|
||||
hx-target="#sc-timeline-body"
|
||||
hx-swap="innerHTML">
|
||||
<div class="card-header mc-panel-header d-flex justify-content-between align-items-center">
|
||||
<span>// CORRECTION TIMELINE</span>
|
||||
<span class="badge badge-info">{{ corrections | length }}</span>
|
||||
</div>
|
||||
<div class="card-body p-3" id="sc-timeline-body">
|
||||
{% include "partials/self_correction_timeline.html" %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
247
src/infrastructure/self_correction.py
Normal file
247
src/infrastructure/self_correction.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""Self-correction event logger.
|
||||
|
||||
Records instances where the agent detected its own errors and the steps
|
||||
it took to correct them. Used by the Self-Correction Dashboard to visualise
|
||||
these events and surface recurring failure patterns.
|
||||
|
||||
Usage::
|
||||
|
||||
from infrastructure.self_correction import log_self_correction, get_corrections, get_patterns
|
||||
|
||||
log_self_correction(
|
||||
source="agentic_loop",
|
||||
original_intent="Execute step 3: deploy service",
|
||||
detected_error="ConnectionRefusedError: port 8080 unavailable",
|
||||
correction_strategy="Retry on alternate port 8081",
|
||||
final_outcome="Success on retry",
|
||||
task_id="abc123",
|
||||
)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import uuid
|
||||
from collections.abc import Generator
|
||||
from contextlib import closing, contextmanager
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Database
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_DB_PATH: Path | None = None
|
||||
|
||||
|
||||
def _get_db_path() -> Path:
    """Return the location of the self-correction SQLite database.

    If the module-level ``_DB_PATH`` is already set, it is returned as is.
    Otherwise the path ``<settings.repo_root>/data/self_correction.db`` is
    computed, stored in ``_DB_PATH`` and returned.
    """
    global _DB_PATH
    if _DB_PATH is not None:
        return _DB_PATH

    # Imported only when the path actually has to be computed.
    from config import settings

    _DB_PATH = Path(settings.repo_root, "data", "self_correction.db")
    return _DB_PATH
|
||||
|
||||
|
||||
@contextmanager
def _get_db() -> Generator[sqlite3.Connection, None, None]:
    """Yield a SQLite connection to the self-correction database.

    Before the connection is handed over, the parent directory of the
    database file is created if missing and the ``self_correction_events``
    table plus its two indexes are created if they do not exist yet.
    Rows are returned as ``sqlite3.Row``. The connection is closed when the
    ``with`` block exits; callers must commit their own writes.
    """
    target = _get_db_path()
    target.parent.mkdir(parents=True, exist_ok=True)

    # Idempotent schema setup, executed on every connection.
    schema_statements = (
        """
            CREATE TABLE IF NOT EXISTS self_correction_events (
                id TEXT PRIMARY KEY,
                source TEXT NOT NULL,
                task_id TEXT DEFAULT '',
                original_intent TEXT NOT NULL,
                detected_error TEXT NOT NULL,
                correction_strategy TEXT NOT NULL,
                final_outcome TEXT NOT NULL,
                outcome_status TEXT DEFAULT 'success',
                error_type TEXT DEFAULT '',
                created_at TEXT DEFAULT (datetime('now'))
            )
        """,
        "CREATE INDEX IF NOT EXISTS idx_sc_created ON self_correction_events(created_at)",
        "CREATE INDEX IF NOT EXISTS idx_sc_error_type ON self_correction_events(error_type)",
    )

    connection = sqlite3.connect(str(target))
    with closing(connection):
        connection.row_factory = sqlite3.Row
        for statement in schema_statements:
            connection.execute(statement)
        connection.commit()
        yield connection
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Write
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def log_self_correction(
    *,
    source: str,
    original_intent: str,
    detected_error: str,
    correction_strategy: str,
    final_outcome: str,
    task_id: str = "",
    outcome_status: str = "success",
    error_type: str = "",
) -> str:
    """Persist one self-correction event and return its generated ID.

    Args:
        source: Module or component that triggered the correction.
        original_intent: What the agent was trying to do.
        detected_error: The error or problem that was detected.
        correction_strategy: How the agent attempted to correct the error.
        final_outcome: What the result of the correction attempt was.
        task_id: Optional task/session ID for correlation.
        outcome_status: 'success', 'partial', or 'failed'.
        error_type: Short category label for pattern analysis (e.g.
            'ConnectionError', 'TimeoutError'). When empty, it is derived
            from ``detected_error``.

    Returns:
        The freshly generated UUID string. Database failures are logged as
        warnings and swallowed, so the ID is returned even when the row
        could not be written.
    """
    new_id = str(uuid.uuid4())

    # No explicit category: use the text before the first colon of the
    # error message (e.g. the exception class name), capped at 64 chars.
    category = error_type or detected_error.partition(":")[0].strip()[:64]

    insert_sql = """
        INSERT INTO self_correction_events
            (id, source, task_id, original_intent, detected_error,
             correction_strategy, final_outcome, outcome_status, error_type)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """

    try:
        with _get_db() as conn:
            # The four free-text fields are stored truncated to 2000 chars.
            row_values = (
                new_id,
                source,
                task_id,
                original_intent[:2000],
                detected_error[:2000],
                correction_strategy[:2000],
                final_outcome[:2000],
                outcome_status,
                category,
            )
            conn.execute(insert_sql, row_values)
            conn.commit()
            logger.info(
                "Self-correction logged [%s] source=%s error_type=%s status=%s",
                new_id[:8],
                source,
                category,
                outcome_status,
            )
    except Exception as exc:
        logger.warning("Failed to log self-correction event: %s", exc)

    return new_id
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Read
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_corrections(limit: int = 50) -> list[dict]:
    """Return the most recent self-correction events, newest first.

    Args:
        limit: Maximum number of events to return.

    Returns:
        One dict per event, keyed by column name. An empty list is returned
        when the table is empty or when the query fails (the failure is
        logged as a warning).
    """
    try:
        with _get_db() as conn:
            rows = conn.execute(
                # created_at defaults to datetime('now'), which only has
                # one-second resolution. rowid breaks ties so that events
                # logged within the same second still come back in
                # insertion order, newest first.
                """
                SELECT * FROM self_correction_events
                ORDER BY created_at DESC, rowid DESC
                LIMIT ?
                """,
                (limit,),
            ).fetchall()
            return [dict(r) for r in rows]
    except Exception as exc:
        logger.warning("Failed to fetch self-correction events: %s", exc)
        return []
|
||||
|
||||
|
||||
def get_patterns(top_n: int = 10) -> list[dict]:
    """Summarise logged events per error type, most frequent first.

    Args:
        top_n: Maximum number of error types to return.

    Returns:
        A list of dicts, each with:

        - error_type: category label
        - count: total occurrences (all outcome statuses)
        - success_count: events whose outcome_status is 'success'
        - failed_count: events whose outcome_status is 'failed'
        - last_seen: largest created_at value for that error type

        An empty list is returned when there are no events or when the
        query fails (the failure is logged as a warning).
    """
    query = """
        SELECT
            error_type,
            COUNT(*) AS count,
            SUM(CASE WHEN outcome_status = 'success' THEN 1 ELSE 0 END) AS success_count,
            SUM(CASE WHEN outcome_status = 'failed' THEN 1 ELSE 0 END) AS failed_count,
            MAX(created_at) AS last_seen
        FROM self_correction_events
        GROUP BY error_type
        ORDER BY count DESC
        LIMIT ?
    """
    try:
        with _get_db() as conn:
            cursor = conn.execute(query, (top_n,))
            return [dict(record) for record in cursor.fetchall()]
    except Exception as exc:
        logger.warning("Failed to fetch self-correction patterns: %s", exc)
        return []
|
||||
|
||||
|
||||
def get_stats() -> dict:
    """Return aggregate statistics for the summary panel.

    Returns:
        A dict with integer values for ``total``, ``success_count``,
        ``partial_count``, ``failed_count``, ``unique_error_types``,
        ``sources`` and ``success_rate`` (``success_count`` as a rounded
        percentage of ``total``, or 0 when there are no events). If the
        query fails, the failure is logged as a warning and the all-zero
        result of ``_empty_stats()`` is returned.
    """
    try:
        with _get_db() as conn:
            row = conn.execute(
                """
                SELECT
                    COUNT(*) AS total,
                    SUM(CASE WHEN outcome_status = 'success' THEN 1 ELSE 0 END) AS success_count,
                    SUM(CASE WHEN outcome_status = 'partial' THEN 1 ELSE 0 END) AS partial_count,
                    SUM(CASE WHEN outcome_status = 'failed' THEN 1 ELSE 0 END) AS failed_count,
                    COUNT(DISTINCT error_type) AS unique_error_types,
                    COUNT(DISTINCT source) AS sources
                FROM self_correction_events
                """
            ).fetchone()
            if row is None:
                return _empty_stats()

            # SUM() over zero rows yields NULL, so on an empty table the
            # three *_count columns arrive as None. Coerce them to 0 so the
            # result always holds integers, consistent with _empty_stats(),
            # and numeric comparisons on the counts cannot hit None.
            stats = {key: (value or 0) for key, value in dict(row).items()}

            total = stats["total"]
            if total:
                stats["success_rate"] = round(stats["success_count"] / total * 100)
            else:
                stats["success_rate"] = 0
            return stats
    except Exception as exc:
        logger.warning("Failed to fetch self-correction stats: %s", exc)
        return _empty_stats()
|
||||
|
||||
|
||||
def _empty_stats() -> dict:
    """Return a fresh stats dict in which every field is zero."""
    field_names = (
        "total",
        "success_count",
        "partial_count",
        "failed_count",
        "unique_error_types",
        "sources",
        "success_rate",
    )
    return dict.fromkeys(field_names, 0)
|
||||
@@ -312,6 +312,13 @@ async def _handle_step_failure(
|
||||
"adaptation": step.result[:200],
|
||||
},
|
||||
)
|
||||
_log_self_correction(
|
||||
task_id=task_id,
|
||||
step_desc=step_desc,
|
||||
exc=exc,
|
||||
outcome=step.result,
|
||||
outcome_status="success",
|
||||
)
|
||||
if on_progress:
|
||||
await on_progress(f"[Adapted] {step_desc}", step_num, total_steps)
|
||||
except Exception as adapt_exc: # broad catch intentional
|
||||
@@ -325,9 +332,42 @@ async def _handle_step_failure(
|
||||
duration_ms=int((time.monotonic() - step_start) * 1000),
|
||||
)
|
||||
)
|
||||
_log_self_correction(
|
||||
task_id=task_id,
|
||||
step_desc=step_desc,
|
||||
exc=exc,
|
||||
outcome=f"Adaptation also failed: {adapt_exc}",
|
||||
outcome_status="failed",
|
||||
)
|
||||
completed_results.append(f"Step {step_num}: FAILED")
|
||||
|
||||
|
||||
def _log_self_correction(
    *,
    task_id: str,
    step_desc: str,
    exc: Exception,
    outcome: str,
    outcome_status: str,
) -> None:
    """Record a step failure and its adaptation result as a self-correction event.

    Best-effort: any exception raised while importing or calling the event
    logger is caught and reported at DEBUG level instead of propagating.

    Args:
        task_id: ID of the task the step belongs to.
        step_desc: Description of the step; stored as the original intent.
        exc: The exception that made the step fail.
        outcome: Result text of the adaptation attempt (first 500 chars kept).
        outcome_status: Status label passed through to the event logger.
    """
    try:
        from infrastructure.self_correction import log_self_correction

        exc_name = type(exc).__name__
        event = {
            "source": "agentic_loop",
            "original_intent": step_desc,
            "detected_error": f"{exc_name}: {exc}",
            "correction_strategy": "Adaptive re-plan via LLM",
            "final_outcome": outcome[:500],
            "task_id": task_id,
            "outcome_status": outcome_status,
            "error_type": exc_name,
        }
        log_self_correction(**event)
    except Exception as log_exc:
        logger.debug("Self-correction log failed: %s", log_exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core loop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -2714,3 +2714,74 @@
|
||||
padding: 0.3rem 0.6rem;
|
||||
margin-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
/* ── Self-Correction Dashboard ─────────────────────────────── */
|
||||
.sc-event {
|
||||
border-left: 3px solid var(--border);
|
||||
padding: 0.6rem 0.8rem;
|
||||
margin-bottom: 0.75rem;
|
||||
background: rgba(255,255,255,0.02);
|
||||
border-radius: 0 4px 4px 0;
|
||||
font-size: 0.82rem;
|
||||
}
|
||||
.sc-event.sc-status-success { border-left-color: var(--green); }
|
||||
.sc-event.sc-status-partial { border-left-color: var(--amber); }
|
||||
.sc-event.sc-status-failed { border-left-color: var(--red); }
|
||||
|
||||
.sc-event-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
margin-bottom: 0.4rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
.sc-status-badge {
|
||||
font-size: 0.68rem;
|
||||
font-weight: 700;
|
||||
letter-spacing: 0.06em;
|
||||
padding: 0.15rem 0.45rem;
|
||||
border-radius: 3px;
|
||||
}
|
||||
.sc-status-badge.sc-status-success { color: var(--green); background: rgba(0,255,136,0.08); }
|
||||
.sc-status-badge.sc-status-partial { color: var(--amber); background: rgba(255,179,0,0.08); }
|
||||
.sc-status-badge.sc-status-failed { color: var(--red); background: rgba(255,59,59,0.08); }
|
||||
|
||||
.sc-source-badge {
|
||||
font-size: 0.68rem;
|
||||
color: var(--purple);
|
||||
background: rgba(168,85,247,0.1);
|
||||
padding: 0.1rem 0.4rem;
|
||||
border-radius: 3px;
|
||||
}
|
||||
.sc-event-time { font-size: 0.68rem; color: var(--text-dim); margin-left: auto; }
|
||||
.sc-event-error-type {
|
||||
font-size: 0.72rem;
|
||||
color: var(--amber);
|
||||
font-weight: 600;
|
||||
margin-bottom: 0.3rem;
|
||||
letter-spacing: 0.04em;
|
||||
}
|
||||
.sc-label {
|
||||
font-size: 0.65rem;
|
||||
font-weight: 700;
|
||||
letter-spacing: 0.06em;
|
||||
color: var(--text-dim);
|
||||
margin-right: 0.3rem;
|
||||
}
|
||||
.sc-event-intent, .sc-event-error, .sc-event-strategy, .sc-event-outcome {
|
||||
color: var(--text);
|
||||
margin-bottom: 0.2rem;
|
||||
line-height: 1.4;
|
||||
word-break: break-word;
|
||||
}
|
||||
.sc-event-error { color: var(--red); }
|
||||
.sc-event-strategy { color: var(--text-dim); font-style: italic; }
|
||||
.sc-event-outcome { color: var(--text-bright); }
|
||||
.sc-event-meta { font-size: 0.68rem; color: var(--text-dim); margin-top: 0.3rem; }
|
||||
|
||||
.sc-pattern-type {
|
||||
font-family: var(--font);
|
||||
font-size: 0.8rem;
|
||||
color: var(--text-bright);
|
||||
word-break: break-all;
|
||||
}
|
||||
|
||||
269
tests/unit/test_self_correction.py
Normal file
269
tests/unit/test_self_correction.py
Normal file
@@ -0,0 +1,269 @@
|
||||
"""Unit tests for infrastructure.self_correction."""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _isolated_db(tmp_path, monkeypatch):
    """Point the self-correction module at a fresh temp database per test.

    ``monkeypatch`` puts back whatever ``_DB_PATH`` held before the test,
    rather than unconditionally resetting it to ``None`` afterwards.
    """
    import infrastructure.self_correction as sc_mod

    monkeypatch.setattr(sc_mod, "_DB_PATH", tmp_path / "self_correction.db")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# log_self_correction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestLogSelfCorrection:
    """Behaviour of ``log_self_correction``, observed through ``get_corrections``."""

    def test_returns_event_id(self):
        """The call returns a 36-character string ID."""
        import infrastructure.self_correction as sc

        event_id = sc.log_self_correction(
            source="test",
            original_intent="Do X",
            detected_error="ValueError: bad input",
            correction_strategy="Try Y instead",
            final_outcome="Y succeeded",
        )
        assert isinstance(event_id, str)
        assert len(event_id) == 36  # UUID format

    def test_derives_error_type_from_error_string(self):
        """Without an explicit error_type, the text before the colon is used."""
        import infrastructure.self_correction as sc

        sc.log_self_correction(
            source="test",
            original_intent="Connect",
            detected_error="ConnectionRefusedError: port 80",
            correction_strategy="Use port 8080",
            final_outcome="ok",
        )
        latest = sc.get_corrections(limit=1)[0]
        assert latest["error_type"] == "ConnectionRefusedError"

    def test_explicit_error_type_preserved(self):
        """An explicit error_type is stored unchanged."""
        import infrastructure.self_correction as sc

        sc.log_self_correction(
            source="test",
            original_intent="Run task",
            detected_error="Some weird error",
            correction_strategy="Fix it",
            final_outcome="done",
            error_type="CustomError",
        )
        latest = sc.get_corrections(limit=1)[0]
        assert latest["error_type"] == "CustomError"

    def test_task_id_stored(self):
        """The task_id round-trips through the database."""
        import infrastructure.self_correction as sc

        sc.log_self_correction(
            source="test",
            original_intent="intent",
            detected_error="err",
            correction_strategy="strat",
            final_outcome="outcome",
            task_id="task-abc-123",
        )
        latest = sc.get_corrections(limit=1)[0]
        assert latest["task_id"] == "task-abc-123"

    def test_outcome_status_stored(self):
        """The outcome_status round-trips through the database."""
        import infrastructure.self_correction as sc

        sc.log_self_correction(
            source="test",
            original_intent="i",
            detected_error="e",
            correction_strategy="s",
            final_outcome="o",
            outcome_status="failed",
        )
        latest = sc.get_corrections(limit=1)[0]
        assert latest["outcome_status"] == "failed"

    def test_long_strings_truncated(self):
        """Free-text fields longer than 2000 characters are cut down on write."""
        import infrastructure.self_correction as sc

        long = "x" * 3000
        sc.log_self_correction(
            source="test",
            original_intent=long,
            detected_error=long,
            correction_strategy=long,
            final_outcome=long,
        )
        latest = sc.get_corrections(limit=1)[0]
        assert len(latest["original_intent"]) <= 2000
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_corrections
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestGetCorrections:
    """Ordering and limiting behaviour of ``get_corrections``."""

    def test_empty_db_returns_empty_list(self):
        """A fresh database yields no events."""
        import infrastructure.self_correction as sc

        assert sc.get_corrections() == []

    def test_returns_newest_first(self):
        """Events come back most recent first."""
        import infrastructure.self_correction as sc

        for i in range(3):
            sc.log_self_correction(
                source="test",
                original_intent=f"intent {i}",
                detected_error="err",
                correction_strategy="fix",
                final_outcome="done",
                error_type=f"Type{i}",
            )
        rows = sc.get_corrections(limit=10)
        assert len(rows) == 3
        # Newest first — Type2 should appear before Type0
        types = [row["error_type"] for row in rows]
        assert types.index("Type2") < types.index("Type0")

    def test_limit_respected(self):
        """No more than ``limit`` events are returned."""
        import infrastructure.self_correction as sc

        for _ in range(5):
            sc.log_self_correction(
                source="test",
                original_intent="i",
                detected_error="e",
                correction_strategy="s",
                final_outcome="o",
            )
        assert len(sc.get_corrections(limit=3)) == 3
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_patterns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestGetPatterns:
    """Grouping, counting and ordering behaviour of ``get_patterns``."""

    def test_empty_db_returns_empty_list(self):
        """A fresh database yields no patterns."""
        import infrastructure.self_correction as sc

        assert sc.get_patterns() == []

    def test_counts_by_error_type(self):
        """Events are counted per error_type."""
        import infrastructure.self_correction as sc

        # Three TimeoutError events followed by a single ValueError event.
        for error_type in ["TimeoutError"] * 3 + ["ValueError"]:
            sc.log_self_correction(
                source="test",
                original_intent="i",
                detected_error="e",
                correction_strategy="s",
                final_outcome="o",
                error_type=error_type,
            )
        by_type = {p["error_type"]: p for p in sc.get_patterns(top_n=10)}
        assert by_type["TimeoutError"]["count"] == 3
        assert by_type["ValueError"]["count"] == 1

    def test_success_vs_failed_counts(self):
        """Success and failed outcomes are tallied separately per type."""
        import infrastructure.self_correction as sc

        for status in ("success", "failed"):
            sc.log_self_correction(
                source="test",
                original_intent="i",
                detected_error="e",
                correction_strategy="s",
                final_outcome="o",
                error_type="Foo",
                outcome_status=status,
            )
        patterns = sc.get_patterns(top_n=5)
        foo = next(p for p in patterns if p["error_type"] == "Foo")
        assert foo["success_count"] == 1
        assert foo["failed_count"] == 1

    def test_ordered_by_count_desc(self):
        """The most frequent error type is listed first."""
        import infrastructure.self_correction as sc

        for label, occurrences in (("Rare", 2), ("Common", 5)):
            for _ in range(occurrences):
                sc.log_self_correction(
                    source="t",
                    original_intent="i",
                    detected_error="e",
                    correction_strategy="s",
                    final_outcome="o",
                    error_type=label,
                )
        patterns = sc.get_patterns(top_n=5)
        assert patterns[0]["error_type"] == "Common"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_stats
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestGetStats:
    """Aggregate figures reported by ``get_stats``."""

    def test_empty_db_returns_zeroes(self):
        """A fresh database reports zero events and a zero success rate."""
        import infrastructure.self_correction as sc

        stats = sc.get_stats()
        assert stats["total"] == 0
        assert stats["success_rate"] == 0

    def test_counts_outcomes(self):
        """One success plus one failure gives a 50% success rate."""
        import infrastructure.self_correction as sc

        for status in ("success", "failed"):
            sc.log_self_correction(
                source="t",
                original_intent="i",
                detected_error="e",
                correction_strategy="s",
                final_outcome="o",
                outcome_status=status,
            )
        stats = sc.get_stats()
        assert stats["total"] == 2
        assert stats["success_count"] == 1
        assert stats["failed_count"] == 1
        assert stats["success_rate"] == 50

    def test_success_rate_100_when_all_succeed(self):
        """Only successful events gives a 100% success rate."""
        import infrastructure.self_correction as sc

        for _ in range(4):
            sc.log_self_correction(
                source="t",
                original_intent="i",
                detected_error="e",
                correction_strategy="s",
                final_outcome="o",
                outcome_status="success",
            )
        assert sc.get_stats()["success_rate"] == 100
|
||||
Reference in New Issue
Block a user