def get_recent_entries(self, limit: int = 50) -> list[dict]:
    """Load recent entries across all session logs.

    Files matching ``session_*.jsonl`` under ``self.logs_dir`` are visited in
    reverse lexical order of their names, and the lines of each file are
    read bottom-up, so the result is ordered most recent first (assuming
    file names sort chronologically, e.g. an ISO date suffix).

    Reading is best-effort: unreadable or undecodable files, blank lines,
    malformed JSON lines, and JSON values that are not objects are skipped
    rather than raised.

    Args:
        limit: Maximum number of entries to return. Values <= 0 yield
            an empty list.

    Returns:
        List of entries (most recent first), each one a ``dict``.
    """
    entries: list[dict] = []
    if limit <= 0:
        return entries

    for log_file in sorted(self.logs_dir.glob("session_*.jsonl"), reverse=True):
        # Keep the try body limited to the read itself. UnicodeDecodeError is
        # a ValueError, not an OSError, so it must be named explicitly or a
        # single corrupt file would abort the whole scan.
        try:
            with open(log_file) as f:
                lines = [ln for ln in f if ln.strip()]
        except (OSError, UnicodeDecodeError):
            continue

        for line in reversed(lines):
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A line may hold valid JSON that is not an object (list, string,
            # number). Callers use ``entry.get(...)``, so only dicts qualify.
            if not isinstance(entry, dict):
                continue
            entries.append(entry)
            if len(entries) >= limit:
                return entries
    return entries
# ---------------------------------------------------------------------------
# Confidence threshold used for flagging low-confidence responses
# ---------------------------------------------------------------------------
_LOW_CONFIDENCE_THRESHOLD = 0.5

# Function words longer than three characters. Without this filter they
# out-count real subjects and end up reported as "recurring topics".
_TOPIC_STOPWORDS = frozenset(
    {
        "about", "also", "been", "being", "could", "does", "doing", "from",
        "have", "here", "into", "just", "more", "most", "much", "only",
        "over", "please", "should", "some", "such", "than", "that", "their",
        "them", "then", "there", "these", "they", "this", "those", "very",
        "were", "what", "when", "where", "which", "while", "will", "with",
        "would", "your",
    }
)


def _recurring_topics(user_msgs: list[dict]) -> list[tuple[str, int]]:
    """Return up to five ``(word, count)`` pairs mentioned at least 3 times."""
    from collections import Counter  # local import keeps the block self-contained

    counts = Counter()
    for msg in user_msgs:
        # ``str(...)`` guards against non-string content in a log entry.
        for word in str(msg.get("content") or "").lower().split():
            cleaned = word.strip(".,!?\"'()[]")
            if len(cleaned) > 3 and cleaned not in _TOPIC_STOPWORDS:
                counts[cleaned] += 1
    # most_common() orders by count, ties in first-seen order.
    return [(word, n) for word, n in counts.most_common() if n >= 3][:5]


def _build_reflection_report(entries: list[dict]) -> str:
    """Format the self-reflection report for already-loaded dict entries."""
    messages = [e for e in entries if e.get("type") == "message"]
    errors = [e for e in entries if e.get("type") == "error"]
    timmy_msgs = [m for m in messages if m.get("role") == "timmy"]
    user_msgs = [m for m in messages if m.get("role") == "user"]

    # 1. Low-confidence responses. Only numeric confidences are comparable;
    #    anything else (missing, None, a string) is ignored instead of raising.
    low_conf = [
        m
        for m in timmy_msgs
        if isinstance(m.get("confidence"), (int, float))
        and m["confidence"] < _LOW_CONFIDENCE_THRESHOLD
    ]

    # 2. Repeated user topics (simple word frequency)
    repeated = _recurring_topics(user_msgs)

    sections: list[str] = ["## Self-Reflection Report\n"]
    sections.append(
        f"Reviewed {len(entries)} recent entries: "
        f"{len(user_msgs)} user messages, "
        f"{len(timmy_msgs)} responses, "
        f"{len(errors)} errors.\n"
    )

    # Low confidence
    if low_conf:
        sections.append(f"### Low-Confidence Responses ({len(low_conf)})")
        for m in low_conf[:5]:
            ts = str(m.get("timestamp") or "?")[:19]
            text = str(m.get("content") or "")[:120]
            sections.append(f"- [{ts}] confidence={m['confidence']:.0%}: {text}")
        sections.append("")
    else:
        sections.append(
            "### Low-Confidence Responses\nNone found — all responses above threshold.\n"
        )

    # Errors
    if errors:
        sections.append(f"### Errors ({len(errors)})")
        for e in errors[:5]:
            ts = str(e.get("timestamp") or "?")[:19]
            err = str(e.get("error") or "")[:120]
            sections.append(f"- [{ts}] {err}")
        sections.append("")
    else:
        sections.append("### Errors\nNo errors recorded.\n")

    # Repeated topics
    if repeated:
        sections.append("### Recurring Topics")
        sections.extend(f'- "{word}" ({count} mentions)' for word, count in repeated)
        sections.append("")
    else:
        sections.append("### Recurring Topics\nNo strong patterns detected.\n")

    # Actionable summary
    insights: list[str] = []
    if low_conf:
        insights.append("Consider studying topics where confidence was low.")
    if errors:
        insights.append("Review error patterns for recurring infrastructure issues.")
    if repeated:
        top_topic = repeated[0][0]
        insights.append(
            f'User frequently asks about "{top_topic}" — consider deepening knowledge here.'
        )
    if not insights:
        insights.append("Conversations look healthy. Keep up the good work.")

    sections.append("### Insights")
    sections.extend(f"- {insight}" for insight in insights)

    return "\n".join(sections)


def self_reflect(limit: int = 30) -> str:
    """Review recent conversations and reflect on Timmy's own behavior.

    Scans past session entries for patterns: low-confidence responses,
    errors, repeated topics, and conversation quality signals. Returns
    a structured reflection that Timmy can use to improve.

    Args:
        limit: How many recent entries to review (default 30).

    Returns:
        A formatted self-reflection report, or a short notice when there
        is no history to review.
    """
    sl = get_session_logger()
    # Flush first so entries still buffered in memory are on disk to be read.
    sl.flush()
    # Drop anything that is not a JSON object; the report relies on ``.get``.
    entries = [e for e in sl.get_recent_entries(limit=limit) if isinstance(e, dict)]

    if not entries:
        return "No conversation history to reflect on yet."

    return _build_reflection_report(entries)
class TestSelfReflect:
    """Behavioural tests for the ``self_reflect()`` tool function."""

    @staticmethod
    def _install_logger(logs_dir, monkeypatch):
        """Point the module-level session logger at *logs_dir*."""
        logger_under_test = SessionLogger(logs_dir=logs_dir)
        monkeypatch.setattr("timmy.session_logger._session_logger", logger_under_test)

    @staticmethod
    def _message(role, content, timestamp, confidence=None):
        """Build a ``message`` entry; ``confidence`` is omitted when None."""
        entry = {"type": "message", "role": role, "content": content}
        if confidence is not None:
            entry["confidence"] = confidence
        entry["timestamp"] = timestamp
        return entry

    def _write_entries(self, logs_dir, entries):
        """Helper: write entries to today's session file, one JSON per line."""
        target = logs_dir / f"session_{date.today().isoformat()}.jsonl"
        payload = "".join(json.dumps(entry) + "\n" for entry in entries)
        with open(target, "w") as handle:
            handle.write(payload)

    def test_self_reflect_no_history(self, tmp_path, mock_settings, monkeypatch):
        """An empty log directory yields the 'no history' notice."""
        self._install_logger(tmp_path, monkeypatch)

        report = self_reflect()

        assert "No conversation history" in report

    def test_self_reflect_detects_low_confidence(self, tmp_path, mock_settings, monkeypatch):
        """Only the response below the threshold is flagged."""
        self._install_logger(tmp_path, monkeypatch)
        history = [
            self._message("timmy", "I think maybe it could be X", "2026-03-01T10:00:00", 0.3),
            self._message("timmy", "The answer is Y", "2026-03-01T10:01:00", 0.9),
        ]
        self._write_entries(tmp_path, history)

        report = self_reflect()

        assert "Low-Confidence Responses (1)" in report
        assert "confidence=30%" in report

    def test_self_reflect_detects_errors(self, tmp_path, mock_settings, monkeypatch):
        """Error entries are counted and their text is echoed."""
        self._install_logger(tmp_path, monkeypatch)
        history = [
            {
                "type": "error",
                "error": "Ollama connection refused",
                "timestamp": "2026-03-01T10:00:00",
            },
            self._message("user", "hello", "2026-03-01T10:01:00"),
        ]
        self._write_entries(tmp_path, history)

        report = self_reflect()

        assert "Errors (1)" in report
        assert "Ollama connection refused" in report

    def test_self_reflect_detects_repeated_topics(self, tmp_path, mock_settings, monkeypatch):
        """A word used in three user messages shows up as a recurring topic."""
        self._install_logger(tmp_path, monkeypatch)
        prompts = (
            ("tell me about bitcoin", "2026-03-01T10:00:00"),
            ("more about bitcoin please", "2026-03-01T10:01:00"),
            ("bitcoin price today", "2026-03-01T10:02:00"),
        )
        history = [self._message("user", text, stamp) for text, stamp in prompts]
        self._write_entries(tmp_path, history)

        report = self_reflect()

        assert "Recurring Topics" in report
        assert "bitcoin" in report

    def test_self_reflect_healthy_session(self, tmp_path, mock_settings, monkeypatch):
        """With nothing to flag, the report carries the 'healthy' insight."""
        self._install_logger(tmp_path, monkeypatch)
        history = [
            self._message("user", "hi", "2026-03-01T10:00:00"),
            self._message("timmy", "Hello!", "2026-03-01T10:01:00", 0.9),
        ]
        self._write_entries(tmp_path, history)

        report = self_reflect()

        assert "Self-Reflection Report" in report
        assert "Keep up the good work" in report

    def test_self_reflect_includes_insights(self, tmp_path, mock_settings, monkeypatch):
        """A low-confidence answer produces an actionable insight."""
        self._install_logger(tmp_path, monkeypatch)
        history = [
            self._message("timmy", "I'm not sure about this", "2026-03-01T10:00:00", 0.2),
        ]
        self._write_entries(tmp_path, history)

        report = self_reflect()

        assert "Insights" in report
        assert "confidence was low" in report