fix: add self_reflect tool for past behavior review (#417)
All checks were successful
Tests / lint (push) Successful in 4s
Tests / test (push) Successful in 1m2s

Co-authored-by: Kimi Agent <kimi@timmy.local>
Co-committed-by: Kimi Agent <kimi@timmy.local>
This commit was merged in pull request #417.
This commit is contained in:
2026-03-19 09:39:14 -04:00
committed by hermes
parent 332fa373b8
commit 3afb62afb7
4 changed files with 306 additions and 5 deletions

View File

@@ -155,6 +155,34 @@ class SessionLogger:
"decisions": sum(1 for e in entries if e.get("type") == "decision"),
}
def get_recent_entries(self, limit: int = 50) -> list[dict]:
    """Load the most recent entries across all session logs.

    Files matching ``session_*.jsonl`` in ``self.logs_dir`` are visited in
    reverse file-name order and each file is read bottom-up, so entries
    come back newest first provided file names sort chronologically and
    entries are appended in time order (presumably the case for
    date-stamped JSONL logs -- confirm against the writer).

    Reading is best-effort: unreadable files, blank lines, lines that are
    not valid JSON, and JSON values that are not objects are skipped.

    Args:
        limit: Maximum number of entries to return. Zero or a negative
            value yields an empty list.

    Returns:
        Up to ``limit`` entries as dicts, most recent first.
    """
    entries: list[dict] = []
    if limit <= 0:
        return entries

    log_files = sorted(self.logs_dir.glob("session_*.jsonl"), reverse=True)
    for log_file in log_files:
        try:
            # Explicit UTF-8 keeps decoding independent of the platform
            # locale; errors="replace" stops a single corrupt byte from
            # raising UnicodeDecodeError (which is not an OSError) and
            # aborting the whole scan.
            with open(log_file, encoding="utf-8", errors="replace") as f:
                lines = [ln for ln in f if ln.strip()]
        except OSError:
            continue  # unreadable file: move on to the next one

        for line in reversed(lines):
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue  # corrupt line: skip it, keep the rest
            # Valid JSON that is not an object (null, a list, a number)
            # would break callers that rely on dict access, and must not
            # count against the limit.
            if not isinstance(entry, dict):
                continue
            entries.append(entry)
            if len(entries) >= limit:
                return entries
    return entries
def search(self, query: str, role: str | None = None, limit: int = 10) -> list[dict]:
"""Search across all session logs for entries matching a query.
@@ -287,3 +315,120 @@ def session_history(query: str, role: str = "", limit: int = 10) -> str:
lines[-1] += f" ({source})"
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Confidence threshold used for flagging low-confidence responses
# ---------------------------------------------------------------------------
# A response is flagged when its recorded confidence is strictly below this.
_LOW_CONFIDENCE_THRESHOLD = 0.5

# Report shaping: how many examples each section lists and how much of each
# message / error text is quoted.
_REFLECT_MAX_ITEMS = 5
_REFLECT_SNIPPET_CHARS = 120


def _one_line(value: object) -> str:
    """Return *value* as a single-line string; falsy values become ``""``.

    Log fields are not guaranteed to be strings, and embedded newlines would
    break the Markdown bullet lists in the report, so every quoted field is
    coerced with ``str()`` and has its whitespace runs collapsed.
    """
    if not value:
        return ""
    return " ".join(str(value).split())


def _is_low_confidence(entry: dict) -> bool:
    """Return True when *entry* carries a numeric confidence below the threshold."""
    conf = entry.get("confidence")
    # bool is a subclass of int, but a True/False flag is not a score; any
    # other non-numeric value (e.g. the string "0.3") cannot be compared or
    # rendered with the percent format, so it is ignored instead of raising.
    if isinstance(conf, bool) or not isinstance(conf, (int, float)):
        return False
    return conf < _LOW_CONFIDENCE_THRESHOLD


def _recurring_topics(user_msgs: list[dict]) -> list[tuple[str, int]]:
    """Return up to five ``(word, count)`` pairs for words used 3+ times.

    This is a plain word-frequency count over user message content: words
    are lower-cased, stripped of surrounding punctuation, and ignored when
    four characters or shorter remain... i.e. only words longer than three
    characters are counted. Pairs are ordered by descending count; ties keep
    first-seen order.
    """
    from collections import Counter

    counts: Counter = Counter()
    for msg in user_msgs:
        for word in _one_line(msg.get("content")).lower().split():
            cleaned = word.strip(".,!?\"'()[]")
            if len(cleaned) > 3:
                counts[cleaned] += 1
    frequent = [(w, c) for w, c in counts.most_common() if c >= 3]
    return frequent[:_REFLECT_MAX_ITEMS]


def self_reflect(limit: int = 30) -> str:
    """Review recent conversations and reflect on Timmy's own behavior.

    Scans past session entries for patterns: low-confidence responses,
    errors, and repeated user topics. Returns a structured Markdown
    reflection that Timmy can use to improve.

    Malformed log data is tolerated: entries that are not dicts are ignored,
    non-numeric confidence values are not flagged, and non-string text
    fields are stringified.

    Args:
        limit: How many recent entries to review (default 30).

    Returns:
        A formatted self-reflection report, or a short notice when there is
        no history to review.
    """
    sl = get_session_logger()
    # Flush before reading; presumably this writes buffered entries to disk so
    # the current session is included -- confirm against SessionLogger.flush.
    sl.flush()
    entries = [e for e in sl.get_recent_entries(limit=limit) if isinstance(e, dict)]

    if not entries:
        return "No conversation history to reflect on yet."

    # Categorize entries
    messages = [e for e in entries if e.get("type") == "message"]
    errors = [e for e in entries if e.get("type") == "error"]
    timmy_msgs = [m for m in messages if m.get("role") == "timmy"]
    user_msgs = [m for m in messages if m.get("role") == "user"]

    low_conf = [m for m in timmy_msgs if _is_low_confidence(m)]
    repeated = _recurring_topics(user_msgs)

    # Build reflection report
    sections: list[str] = [
        "## Self-Reflection Report\n",
        f"Reviewed {len(entries)} recent entries: "
        f"{len(user_msgs)} user messages, "
        f"{len(timmy_msgs)} responses, "
        f"{len(errors)} errors.\n",
    ]

    # Low confidence
    if low_conf:
        sections.append(f"### Low-Confidence Responses ({len(low_conf)})")
        for m in low_conf[:_REFLECT_MAX_ITEMS]:
            # The first 19 characters of an ISO timestamp are
            # YYYY-MM-DDTHH:MM:SS.
            ts = _one_line(m.get("timestamp"))[:19] or "?"
            conf = m["confidence"]
            text = _one_line(m.get("content"))[:_REFLECT_SNIPPET_CHARS]
            sections.append(f"- [{ts}] confidence={conf:.0%}: {text}")
        sections.append("")
    else:
        sections.append(
            "### Low-Confidence Responses\nNone found — all responses above threshold.\n"
        )

    # Errors
    if errors:
        sections.append(f"### Errors ({len(errors)})")
        for e in errors[:_REFLECT_MAX_ITEMS]:
            ts = _one_line(e.get("timestamp"))[:19] or "?"
            err = _one_line(e.get("error"))[:_REFLECT_SNIPPET_CHARS]
            sections.append(f"- [{ts}] {err}")
        sections.append("")
    else:
        sections.append("### Errors\nNo errors recorded.\n")

    # Repeated topics
    if repeated:
        sections.append("### Recurring Topics")
        for word, count in repeated:
            sections.append(f'- "{word}" ({count} mentions)')
        sections.append("")
    else:
        sections.append("### Recurring Topics\nNo strong patterns detected.\n")

    # Actionable summary
    insights: list[str] = []
    if low_conf:
        insights.append("Consider studying topics where confidence was low.")
    if errors:
        insights.append("Review error patterns for recurring infrastructure issues.")
    if repeated:
        top_topic = repeated[0][0]
        insights.append(
            f'User frequently asks about "{top_topic}" — consider deepening knowledge here.'
        )
    if not insights:
        insights.append("Conversations look healthy. Keep up the good work.")

    sections.append("### Insights")
    sections.extend(f"- {insight}" for insight in insights)

    return "\n".join(sections)

View File

@@ -594,9 +594,10 @@ def _register_introspection_tools(toolkit: Toolkit) -> None:
logger.debug("update_gitea_avatar tool not available: %s", exc)
try:
from timmy.session_logger import session_history
from timmy.session_logger import self_reflect, session_history
toolkit.register(session_history, name="session_history")
toolkit.register(self_reflect, name="self_reflect")
except (ImportError, AttributeError) as exc:
logger.warning("Tool execution failed (session_history registration): %s", exc)
logger.debug("session_history tool not available")
@@ -874,6 +875,11 @@ def _introspection_tool_catalog() -> dict:
"description": "Query Timmy's own thought history for past reflections and insights",
"available_in": ["orchestrator"],
},
"self_reflect": {
"name": "Self-Reflect",
"description": "Review recent conversations to spot patterns, low-confidence answers, and errors",
"available_in": ["orchestrator"],
},
"update_gitea_avatar": {
"name": "Update Gitea Avatar",
"description": "Generate and upload a wizard-themed avatar to Timmy's Gitea profile",

View File

@@ -1,5 +1,6 @@
"""Tests for GET /api/world/state endpoint and /api/world/ws relay."""
import asyncio
import json
import logging
import time
@@ -7,8 +8,6 @@ from unittest.mock import AsyncMock, MagicMock, patch
import pytest
import asyncio
from dashboard.routes.world import (
_GROUND_TTL,
_REMIND_AFTER,
@@ -393,7 +392,6 @@ async def test_conversation_buffer_caps_at_max():
def test_log_bark_failure_logs_exception(caplog):
"""_log_bark_failure logs errors from failed bark tasks."""
import asyncio
loop = asyncio.new_event_loop()
@@ -410,7 +408,6 @@ def test_log_bark_failure_logs_exception(caplog):
def test_log_bark_failure_ignores_cancelled():
"""_log_bark_failure silently ignores cancelled tasks."""
import asyncio
task = MagicMock(spec=asyncio.Task)
task.cancelled.return_value = True

View File

@@ -12,6 +12,7 @@ from timmy.session_logger import (
flush_session_logs,
get_session_logger,
get_session_summary,
self_reflect,
session_history,
)
@@ -927,3 +928,155 @@ class TestSessionHistoryTool:
result = session_history("calculator")
assert "tool:calculator" in result
class TestSelfReflect:
    """Tests for the self_reflect() tool function.

    Every test points ``timmy.session_logger._session_logger`` at a
    SessionLogger rooted in pytest's ``tmp_path``, optionally seeds today's
    session file with entries, and asserts on the text of the report.
    """

    def _install_logger(self, logs_dir, monkeypatch):
        """Helper: patch the module-level logger with one rooted at logs_dir."""
        sl = SessionLogger(logs_dir=logs_dir)
        monkeypatch.setattr("timmy.session_logger._session_logger", sl)
        return sl

    def _write_entries(self, logs_dir, entries):
        """Helper: write entries to today's session file, one JSON object per line."""
        path = logs_dir / f"session_{date.today().isoformat()}.jsonl"
        # Explicit encoding keeps the fixture independent of the platform locale.
        with open(path, "w", encoding="utf-8") as f:
            for entry in entries:
                f.write(json.dumps(entry) + "\n")

    def test_self_reflect_no_history(self, tmp_path, mock_settings, monkeypatch):
        """Should return a message when no history exists."""
        self._install_logger(tmp_path, monkeypatch)

        result = self_reflect()

        assert "No conversation history" in result

    def test_self_reflect_detects_low_confidence(self, tmp_path, mock_settings, monkeypatch):
        """Should flag low-confidence responses."""
        self._install_logger(tmp_path, monkeypatch)
        self._write_entries(
            tmp_path,
            [
                {
                    "type": "message",
                    "role": "timmy",
                    "content": "I think maybe it could be X",
                    "confidence": 0.3,
                    "timestamp": "2026-03-01T10:00:00",
                },
                {
                    "type": "message",
                    "role": "timmy",
                    "content": "The answer is Y",
                    "confidence": 0.9,
                    "timestamp": "2026-03-01T10:01:00",
                },
            ],
        )

        result = self_reflect()

        assert "Low-Confidence Responses (1)" in result
        assert "confidence=30%" in result

    def test_self_reflect_detects_errors(self, tmp_path, mock_settings, monkeypatch):
        """Should report errors in reflection."""
        self._install_logger(tmp_path, monkeypatch)
        self._write_entries(
            tmp_path,
            [
                {
                    "type": "error",
                    "error": "Ollama connection refused",
                    "timestamp": "2026-03-01T10:00:00",
                },
                {
                    "type": "message",
                    "role": "user",
                    "content": "hello",
                    "timestamp": "2026-03-01T10:01:00",
                },
            ],
        )

        result = self_reflect()

        assert "Errors (1)" in result
        assert "Ollama connection refused" in result

    def test_self_reflect_detects_repeated_topics(self, tmp_path, mock_settings, monkeypatch):
        """Should identify recurring user topics."""
        self._install_logger(tmp_path, monkeypatch)
        self._write_entries(
            tmp_path,
            [
                {
                    "type": "message",
                    "role": "user",
                    "content": "tell me about bitcoin",
                    "timestamp": "2026-03-01T10:00:00",
                },
                {
                    "type": "message",
                    "role": "user",
                    "content": "more about bitcoin please",
                    "timestamp": "2026-03-01T10:01:00",
                },
                {
                    "type": "message",
                    "role": "user",
                    "content": "bitcoin price today",
                    "timestamp": "2026-03-01T10:02:00",
                },
            ],
        )

        result = self_reflect()

        assert "Recurring Topics" in result
        assert "bitcoin" in result

    def test_self_reflect_healthy_session(self, tmp_path, mock_settings, monkeypatch):
        """Should report healthy when no issues found."""
        self._install_logger(tmp_path, monkeypatch)
        self._write_entries(
            tmp_path,
            [
                {
                    "type": "message",
                    "role": "user",
                    "content": "hi",
                    "timestamp": "2026-03-01T10:00:00",
                },
                {
                    "type": "message",
                    "role": "timmy",
                    "content": "Hello!",
                    "confidence": 0.9,
                    "timestamp": "2026-03-01T10:01:00",
                },
            ],
        )

        result = self_reflect()

        assert "Self-Reflection Report" in result
        assert "Keep up the good work" in result

    def test_self_reflect_includes_insights(self, tmp_path, mock_settings, monkeypatch):
        """Should include actionable insights section."""
        self._install_logger(tmp_path, monkeypatch)
        self._write_entries(
            tmp_path,
            [
                {
                    "type": "message",
                    "role": "timmy",
                    "content": "I'm not sure about this",
                    "confidence": 0.2,
                    "timestamp": "2026-03-01T10:00:00",
                },
            ],
        )

        result = self_reflect()

        assert "Insights" in result
        assert "confidence was low" in result

    def test_self_reflect_respects_limit(self, tmp_path, mock_settings, monkeypatch):
        """Should review no more than ``limit`` entries."""
        self._install_logger(tmp_path, monkeypatch)
        self._write_entries(
            tmp_path,
            [
                {
                    "type": "message",
                    "role": "user",
                    "content": f"question {i}",
                    "timestamp": f"2026-03-01T10:0{i}:00",
                }
                for i in range(5)
            ],
        )

        result = self_reflect(limit=2)

        assert "Reviewed 2 recent entries: 2 user messages" in result