fix: add self_reflect tool for past behavior review (#417)

Co-authored-by: Kimi Agent <kimi@timmy.local> Co-committed-by: Kimi Agent <kimi@timmy.local>
2026-03-19 09:39:14 -04:00
parent 332fa373b8
commit 3afb62afb7
4 changed files with 306 additions and 5 deletions
--- a/src/timmy/session_logger.py
+++ b/src/timmy/session_logger.py
@@ -155,6 +155,34 @@ class SessionLogger:
            "decisions": sum(1 for e in entries if e.get("type") == "decision"),
        }

+    def get_recent_entries(self, limit: int = 50) -> list[dict]:
+        """Load the most recent entries across all session logs.
+
+        Files matching ``session_*.jsonl`` in ``self.logs_dir`` are visited in
+        reverse-sorted path order and each file is read from its last line to
+        its first.  The result is therefore newest-first as long as file names
+        sort chronologically and entries are appended in time order.
+
+        Blank lines, lines that are not valid JSON, and JSON values that are
+        not objects are skipped.  A file that cannot be opened, read, or
+        decoded is skipped as a whole.
+
+        Args:
+            limit: Maximum number of entries to return.  Values <= 0 yield
+                an empty list.
+
+        Returns:
+            List of entry dicts (most recent first), at most ``limit`` long.
+        """
+        entries: list[dict] = []
+        if limit <= 0:
+            return entries
+
+        log_files = sorted(self.logs_dir.glob("session_*.jsonl"), reverse=True)
+        for log_file in log_files:
+            try:
+                # NOTE(review): decoding uses the platform default, as before;
+                # confirm it matches the encoding the writer uses.
+                with open(log_file) as f:
+                    lines = [ln for ln in f if ln.strip()]
+            except (OSError, UnicodeDecodeError):
+                # Best effort: one unreadable or corrupt file must not hide
+                # the entries stored in the remaining files.
+                continue
+
+            for line in reversed(lines):
+                try:
+                    entry = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                # Honour the ``list[dict]`` contract: valid JSON that is not
+                # an object (a list, string, number, ...) is not a log entry.
+                if not isinstance(entry, dict):
+                    continue
+                entries.append(entry)
+                if len(entries) >= limit:
+                    return entries
+        return entries
+
+
    def search(self, query: str, role: str | None = None, limit: int = 10) -> list[dict]:
        """Search across all session logs for entries matching a query.

@@ -287,3 +315,120 @@ def session_history(query: str, role: str = "", limit: int = 10) -> str:
            lines[-1] += f"  ({source})"

    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Confidence threshold used for flagging low-confidence responses.
+# self_reflect() flags a response when its recorded confidence is strictly
+# below this value (the comparison is `<`, so a value equal to the threshold
+# is not flagged).
+# ---------------------------------------------------------------------------
+_LOW_CONFIDENCE_THRESHOLD = 0.5
+
+
+def self_reflect(limit: int = 30) -> str:
+    """Review recent conversations and reflect on Timmy's own behavior.
+
+    Scans past session entries for patterns: low-confidence responses,
+    errors, and repeated user topics.  Returns a structured Markdown
+    report that Timmy can use to improve.
+
+    Malformed log data degrades the report instead of raising: entries
+    that are not JSON objects are ignored, a response only counts as
+    low-confidence when its ``confidence`` is a number below
+    ``_LOW_CONFIDENCE_THRESHOLD``, and non-string ``content``,
+    ``timestamp`` or ``error`` values are converted with ``str()``.
+
+    Args:
+        limit: How many recent entries to review (default 30).
+
+    Returns:
+        A formatted self-reflection report, or a short notice when there
+        is no history to review.
+    """
+    from collections import Counter  # function-scope: no file-level import change
+
+    sl = get_session_logger()
+    # Presumably persists buffered entries so the read below sees them.
+    sl.flush()
+    # Only JSON objects are usable entries; everything below relies on ``.get``.
+    entries = [e for e in sl.get_recent_entries(limit=limit) if isinstance(e, dict)]
+
+    if not entries:
+        return "No conversation history to reflect on yet."
+
+    # Categorize entries
+    messages = [e for e in entries if e.get("type") == "message"]
+    errors = [e for e in entries if e.get("type") == "error"]
+    timmy_msgs = [e for e in messages if e.get("role") == "timmy"]
+    user_msgs = [e for e in messages if e.get("role") == "user"]
+
+    # 1. Low-confidence responses (missing or non-numeric confidence is skipped)
+    low_conf = [
+        m
+        for m in timmy_msgs
+        if isinstance(m.get("confidence"), (int, float))
+        and m["confidence"] < _LOW_CONFIDENCE_THRESHOLD
+    ]
+
+    # 2. Identify repeated user topics (simple word frequency).  Words are
+    #    lower-cased, stripped of surrounding punctuation, and must be longer
+    #    than three characters to count.
+    words = (
+        word.strip(".,!?\"'()[]")
+        for m in user_msgs
+        for word in str(m.get("content") or "").lower().split()
+    )
+    topic_counts = Counter(w for w in words if len(w) > 3)
+    # most_common() sorts by count, ties in first-seen order; keep the top
+    # five words mentioned at least three times.
+    repeated = [(w, c) for w, c in topic_counts.most_common() if c >= 3][:5]
+
+    # Build reflection report
+    sections: list[str] = ["## Self-Reflection Report\n"]
+
+    sections.append(
+        f"Reviewed {len(entries)} recent entries: "
+        f"{len(user_msgs)} user messages, "
+        f"{len(timmy_msgs)} responses, "
+        f"{len(errors)} errors.\n"
+    )
+
+    # Low confidence (at most five shown; entries arrive newest first)
+    if low_conf:
+        sections.append(f"### Low-Confidence Responses ({len(low_conf)})")
+        for m in low_conf[:5]:
+            ts = str(m.get("timestamp") or "?")[:19]
+            conf = m["confidence"]
+            text = str(m.get("content") or "")[:120]
+            sections.append(f"- [{ts}] confidence={conf:.0%}: {text}")
+        sections.append("")
+    else:
+        sections.append(
+            "### Low-Confidence Responses\nNone found — all responses above threshold.\n"
+        )
+
+    # Errors (at most five shown)
+    if errors:
+        sections.append(f"### Errors ({len(errors)})")
+        for e in errors[:5]:
+            ts = str(e.get("timestamp") or "?")[:19]
+            err = str(e.get("error") or "")[:120]
+            sections.append(f"- [{ts}] {err}")
+        sections.append("")
+    else:
+        sections.append("### Errors\nNo errors recorded.\n")
+
+    # Repeated topics
+    if repeated:
+        sections.append("### Recurring Topics")
+        for word, count in repeated:
+            sections.append(f'- "{word}" ({count} mentions)')
+        sections.append("")
+    else:
+        sections.append("### Recurring Topics\nNo strong patterns detected.\n")
+
+    # Actionable summary: one insight per category that produced findings
+    insights: list[str] = []
+    if low_conf:
+        insights.append("Consider studying topics where confidence was low.")
+    if errors:
+        insights.append("Review error patterns for recurring infrastructure issues.")
+    if repeated:
+        top_topic = repeated[0][0]
+        insights.append(
+            f'User frequently asks about "{top_topic}" — consider deepening knowledge here.'
+        )
+    if not insights:
+        insights.append("Conversations look healthy. Keep up the good work.")
+
+    sections.append("### Insights")
+    sections.extend(f"- {insight}" for insight in insights)
+
+    return "\n".join(sections)
--- a/src/timmy/tools.py
+++ b/src/timmy/tools.py
@@ -594,9 +594,10 @@ def _register_introspection_tools(toolkit: Toolkit) -> None:
        logger.debug("update_gitea_avatar tool not available: %s", exc)

    try:
-        from timmy.session_logger import session_history
+        from timmy.session_logger import self_reflect, session_history

        toolkit.register(session_history, name="session_history")
+        toolkit.register(self_reflect, name="self_reflect")
    except (ImportError, AttributeError) as exc:
        logger.warning("Tool execution failed (session_history registration): %s", exc)
        logger.debug("session_history tool not available")
@@ -874,6 +875,11 @@ def _introspection_tool_catalog() -> dict:
            "description": "Query Timmy's own thought history for past reflections and insights",
            "available_in": ["orchestrator"],
        },
+        "self_reflect": {
+            "name": "Self-Reflect",
+            "description": "Review recent conversations to spot patterns, low-confidence answers, and errors",
+            "available_in": ["orchestrator"],
+        },
        "update_gitea_avatar": {
            "name": "Update Gitea Avatar",
            "description": "Generate and upload a wizard-themed avatar to Timmy's Gitea profile",
--- a/tests/dashboard/test_world_api.py
+++ b/tests/dashboard/test_world_api.py
@@ -1,5 +1,6 @@
 """Tests for GET /api/world/state endpoint and /api/world/ws relay."""

+import asyncio
 import json
 import logging
 import time
@@ -7,8 +8,6 @@ from unittest.mock import AsyncMock, MagicMock, patch

 import pytest

-import asyncio
-
 from dashboard.routes.world import (
    _GROUND_TTL,
    _REMIND_AFTER,
@@ -393,7 +392,6 @@ async def test_conversation_buffer_caps_at_max():

 def test_log_bark_failure_logs_exception(caplog):
    """_log_bark_failure logs errors from failed bark tasks."""
-    import asyncio

    loop = asyncio.new_event_loop()

@@ -410,7 +408,6 @@ def test_log_bark_failure_logs_exception(caplog):

 def test_log_bark_failure_ignores_cancelled():
    """_log_bark_failure silently ignores cancelled tasks."""
-    import asyncio

    task = MagicMock(spec=asyncio.Task)
    task.cancelled.return_value = True
--- a/tests/timmy/test_session_logger.py
+++ b/tests/timmy/test_session_logger.py
@@ -12,6 +12,7 @@ from timmy.session_logger import (
    flush_session_logs,
    get_session_logger,
    get_session_summary,
+    self_reflect,
    session_history,
 )

@@ -927,3 +928,155 @@ class TestSessionHistoryTool:

        result = session_history("calculator")
        assert "tool:calculator" in result
+
+
+class TestSelfReflect:
+    """Behavioural tests for the ``self_reflect()`` tool function."""
+
+    @staticmethod
+    def _message(role, content, timestamp, confidence=None):
+        """Build a ``message`` entry; ``confidence`` is left out when None."""
+        entry = {"type": "message", "role": role, "content": content}
+        if confidence is not None:
+            entry["confidence"] = confidence
+        entry["timestamp"] = timestamp
+        return entry
+
+    @staticmethod
+    def _use_logger(logs_dir, monkeypatch):
+        """Replace ``timmy.session_logger._session_logger`` with a logger on logs_dir."""
+        logger = SessionLogger(logs_dir=logs_dir)
+        monkeypatch.setattr("timmy.session_logger._session_logger", logger)
+        return logger
+
+    def _write_entries(self, logs_dir, entries):
+        """Helper: write entries as JSON lines to today's session file."""
+        target = logs_dir / f"session_{date.today().isoformat()}.jsonl"
+        with open(target, "w") as handle:
+            handle.writelines(json.dumps(item) + "\n" for item in entries)
+
+    def test_self_reflect_no_history(self, tmp_path, mock_settings, monkeypatch):
+        """An empty log directory yields the "no history" notice."""
+        self._use_logger(tmp_path, monkeypatch)
+
+        report = self_reflect()
+
+        assert "No conversation history" in report
+
+    def test_self_reflect_detects_low_confidence(self, tmp_path, mock_settings, monkeypatch):
+        """Only the response below the threshold is flagged."""
+        self._use_logger(tmp_path, monkeypatch)
+        history = [
+            self._message(
+                "timmy", "I think maybe it could be X", "2026-03-01T10:00:00", confidence=0.3
+            ),
+            self._message("timmy", "The answer is Y", "2026-03-01T10:01:00", confidence=0.9),
+        ]
+        self._write_entries(tmp_path, history)
+
+        report = self_reflect()
+
+        assert "Low-Confidence Responses (1)" in report
+        assert "confidence=30%" in report
+
+    def test_self_reflect_detects_errors(self, tmp_path, mock_settings, monkeypatch):
+        """Error entries are counted and their text is included."""
+        self._use_logger(tmp_path, monkeypatch)
+        history = [
+            {
+                "type": "error",
+                "error": "Ollama connection refused",
+                "timestamp": "2026-03-01T10:00:00",
+            },
+            self._message("user", "hello", "2026-03-01T10:01:00"),
+        ]
+        self._write_entries(tmp_path, history)
+
+        report = self_reflect()
+
+        assert "Errors (1)" in report
+        assert "Ollama connection refused" in report
+
+    def test_self_reflect_detects_repeated_topics(self, tmp_path, mock_settings, monkeypatch):
+        """A word used in three user messages shows up as a recurring topic."""
+        self._use_logger(tmp_path, monkeypatch)
+        prompts = [
+            ("tell me about bitcoin", "2026-03-01T10:00:00"),
+            ("more about bitcoin please", "2026-03-01T10:01:00"),
+            ("bitcoin price today", "2026-03-01T10:02:00"),
+        ]
+        self._write_entries(
+            tmp_path,
+            [self._message("user", text, stamp) for text, stamp in prompts],
+        )
+
+        report = self_reflect()
+
+        assert "Recurring Topics" in report
+        assert "bitcoin" in report
+
+    def test_self_reflect_healthy_session(self, tmp_path, mock_settings, monkeypatch):
+        """With nothing to flag, the report carries the "healthy" insight."""
+        self._use_logger(tmp_path, monkeypatch)
+        history = [
+            self._message("user", "hi", "2026-03-01T10:00:00"),
+            self._message("timmy", "Hello!", "2026-03-01T10:01:00", confidence=0.9),
+        ]
+        self._write_entries(tmp_path, history)
+
+        report = self_reflect()
+
+        assert "Self-Reflection Report" in report
+        assert "Keep up the good work" in report
+
+    def test_self_reflect_includes_insights(self, tmp_path, mock_settings, monkeypatch):
+        """A low-confidence response produces the matching insight line."""
+        self._use_logger(tmp_path, monkeypatch)
+        history = [
+            self._message(
+                "timmy", "I'm not sure about this", "2026-03-01T10:00:00", confidence=0.2
+            ),
+        ]
+        self._write_entries(tmp_path, history)
+
+        report = self_reflect()
+
+        assert "Insights" in report
+        assert "confidence was low" in report