fix: add self_reflect tool for past behavior review (#417)
Co-authored-by: Kimi Agent <kimi@timmy.local>
Co-committed-by: Kimi Agent <kimi@timmy.local>
This commit was merged in pull request #417.
This commit is contained in:
@@ -155,6 +155,34 @@ class SessionLogger:
|
||||
"decisions": sum(1 for e in entries if e.get("type") == "decision"),
|
||||
}
|
||||
|
||||
def get_recent_entries(self, limit: int = 50) -> list[dict]:
    """Load recent entries across all session logs.

    Files matching ``session_*.jsonl`` in ``self.logs_dir`` are visited in
    reverse filename order and each file is read bottom-up, so the last
    line of the highest-sorting file comes first.

    Loading is best-effort: files that cannot be read, lines that are not
    valid JSON, and JSON values that are not objects (lists, strings,
    numbers) are skipped instead of raised, so every returned item is a
    ``dict``.

    Args:
        limit: Maximum number of entries to return. Zero or a negative
            value yields an empty list.

    Returns:
        List of entries (most recent first).
    """
    entries: list[dict] = []
    log_files = sorted(self.logs_dir.glob("session_*.jsonl"), reverse=True)

    for log_file in log_files:
        if len(entries) >= limit:
            break

        # Only the read can raise OSError; keep the try body that small.
        try:
            with open(log_file) as f:
                lines = [ln for ln in f if ln.strip()]
        except OSError:
            continue

        for line in reversed(lines):
            if len(entries) >= limit:
                break
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A line such as ``[1, 2]`` or ``"text"`` is valid JSON but not
            # an entry; callers rely on dict methods like ``.get``.
            if isinstance(entry, dict):
                entries.append(entry)

    return entries
|
||||
|
||||
def search(self, query: str, role: str | None = None, limit: int = 10) -> list[dict]:
|
||||
"""Search across all session logs for entries matching a query.
|
||||
|
||||
@@ -287,3 +315,120 @@ def session_history(query: str, role: str = "", limit: int = 10) -> str:
|
||||
lines[-1] += f" ({source})"
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Confidence threshold used for flagging low-confidence responses.
# self_reflect() flags a "timmy" message when its recorded "confidence"
# value is strictly below this number; messages without a confidence
# value are never flagged.
# ---------------------------------------------------------------------------
_LOW_CONFIDENCE_THRESHOLD = 0.5
|
||||
|
||||
|
||||
def _reflect_excerpt(value: object, width: int = 120) -> str:
    """Render a logged field as one line of text, cut to *width* characters."""
    # Collapsing whitespace keeps multi-line content inside a single
    # markdown bullet instead of spilling into the sections that follow.
    return " ".join(str(value or "").split())[:width]


def _reflect_timestamp(entry: dict) -> str:
    """Return the entry's timestamp cut to 19 characters, or "?" if absent."""
    return str(entry.get("timestamp") or "?")[:19]


def _reflect_is_low_confidence(entry: dict) -> bool:
    """Tell whether a message has a numeric confidence below the threshold."""
    confidence = entry.get("confidence")
    # A non-numeric value (e.g. a string in a hand-edited log) can be neither
    # compared nor percent-formatted, so it is treated as "not recorded".
    return isinstance(confidence, (int, float)) and confidence < _LOW_CONFIDENCE_THRESHOLD


def _reflect_recurring_topics(
    user_msgs: list[dict], min_mentions: int = 3, top_n: int = 5
) -> list[tuple[str, int]]:
    """Count words longer than three characters in user messages; return the top repeats."""
    from collections import Counter

    counts: Counter[str] = Counter()
    for msg in user_msgs:
        content = msg.get("content")
        if not isinstance(content, str):
            continue
        for word in content.lower().split():
            cleaned = word.strip(".,!?\"'()[]")
            if len(cleaned) > 3:
                counts[cleaned] += 1
    # most_common() sorts by count, ties in first-seen order.
    frequent = [(word, n) for word, n in counts.most_common() if n >= min_mentions]
    return frequent[:top_n]


def self_reflect(limit: int = 30) -> str:
    """Review recent conversations and reflect on Timmy's own behavior.

    Scans past session entries for patterns: low-confidence responses,
    errors, repeated topics, and conversation quality signals. Returns
    a structured reflection that Timmy can use to improve.

    Malformed log data is tolerated: entries that are not dicts are
    ignored, a non-numeric ``confidence`` is treated as missing, and
    quoted text is flattened to a single line before truncation.

    Args:
        limit: How many recent entries to review (default 30).

    Returns:
        A formatted self-reflection report, or a short notice when there
        is no history to review.
    """
    sl = get_session_logger()
    # NOTE(review): presumably persists buffered entries so the scan below
    # sees them -- confirm against SessionLogger.flush.
    sl.flush()
    entries = [e for e in sl.get_recent_entries(limit=limit) if isinstance(e, dict)]

    if not entries:
        return "No conversation history to reflect on yet."

    # Categorize entries
    messages = [e for e in entries if e.get("type") == "message"]
    errors = [e for e in entries if e.get("type") == "error"]
    timmy_msgs = [e for e in messages if e.get("role") == "timmy"]
    user_msgs = [e for e in messages if e.get("role") == "user"]

    low_conf = [m for m in timmy_msgs if _reflect_is_low_confidence(m)]
    repeated = _reflect_recurring_topics(user_msgs)

    # Build reflection report
    sections: list[str] = ["## Self-Reflection Report\n"]
    sections.append(
        f"Reviewed {len(entries)} recent entries: "
        f"{len(user_msgs)} user messages, "
        f"{len(timmy_msgs)} responses, "
        f"{len(errors)} errors.\n"
    )

    # Low confidence (at most five examples are quoted)
    if low_conf:
        sections.append(f"### Low-Confidence Responses ({len(low_conf)})")
        for m in low_conf[:5]:
            ts = _reflect_timestamp(m)
            conf = m["confidence"]
            text = _reflect_excerpt(m.get("content"))
            sections.append(f"- [{ts}] confidence={conf:.0%}: {text}")
        sections.append("")
    else:
        sections.append(
            "### Low-Confidence Responses\nNone found — all responses above threshold.\n"
        )

    # Errors (at most five examples are quoted)
    if errors:
        sections.append(f"### Errors ({len(errors)})")
        for e in errors[:5]:
            ts = _reflect_timestamp(e)
            err = _reflect_excerpt(e.get("error"))
            sections.append(f"- [{ts}] {err}")
        sections.append("")
    else:
        sections.append("### Errors\nNo errors recorded.\n")

    # Repeated topics
    if repeated:
        sections.append("### Recurring Topics")
        for word, count in repeated:
            sections.append(f'- "{word}" ({count} mentions)')
        sections.append("")
    else:
        sections.append("### Recurring Topics\nNo strong patterns detected.\n")

    # Actionable summary
    insights: list[str] = []
    if low_conf:
        insights.append("Consider studying topics where confidence was low.")
    if errors:
        insights.append("Review error patterns for recurring infrastructure issues.")
    if repeated:
        top_topic = repeated[0][0]
        insights.append(
            f'User frequently asks about "{top_topic}" — consider deepening knowledge here.'
        )
    if not insights:
        insights.append("Conversations look healthy. Keep up the good work.")

    sections.append("### Insights")
    sections.extend(f"- {insight}" for insight in insights)

    return "\n".join(sections)
|
||||
|
||||
@@ -594,9 +594,10 @@ def _register_introspection_tools(toolkit: Toolkit) -> None:
|
||||
logger.debug("update_gitea_avatar tool not available: %s", exc)
|
||||
|
||||
try:
|
||||
from timmy.session_logger import session_history
|
||||
from timmy.session_logger import self_reflect, session_history
|
||||
|
||||
toolkit.register(session_history, name="session_history")
|
||||
toolkit.register(self_reflect, name="self_reflect")
|
||||
except (ImportError, AttributeError) as exc:
|
||||
logger.warning("Tool execution failed (session_history registration): %s", exc)
|
||||
logger.debug("session_history tool not available")
|
||||
@@ -874,6 +875,11 @@ def _introspection_tool_catalog() -> dict:
|
||||
"description": "Query Timmy's own thought history for past reflections and insights",
|
||||
"available_in": ["orchestrator"],
|
||||
},
|
||||
"self_reflect": {
|
||||
"name": "Self-Reflect",
|
||||
"description": "Review recent conversations to spot patterns, low-confidence answers, and errors",
|
||||
"available_in": ["orchestrator"],
|
||||
},
|
||||
"update_gitea_avatar": {
|
||||
"name": "Update Gitea Avatar",
|
||||
"description": "Generate and upload a wizard-themed avatar to Timmy's Gitea profile",
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
"""Tests for GET /api/world/state endpoint and /api/world/ws relay."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
@@ -7,8 +8,6 @@ from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
import asyncio
|
||||
|
||||
from dashboard.routes.world import (
|
||||
_GROUND_TTL,
|
||||
_REMIND_AFTER,
|
||||
@@ -393,7 +392,6 @@ async def test_conversation_buffer_caps_at_max():
|
||||
|
||||
def test_log_bark_failure_logs_exception(caplog):
|
||||
"""_log_bark_failure logs errors from failed bark tasks."""
|
||||
import asyncio
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
|
||||
@@ -410,7 +408,6 @@ def test_log_bark_failure_logs_exception(caplog):
|
||||
|
||||
def test_log_bark_failure_ignores_cancelled():
|
||||
"""_log_bark_failure silently ignores cancelled tasks."""
|
||||
import asyncio
|
||||
|
||||
task = MagicMock(spec=asyncio.Task)
|
||||
task.cancelled.return_value = True
|
||||
|
||||
@@ -12,6 +12,7 @@ from timmy.session_logger import (
|
||||
flush_session_logs,
|
||||
get_session_logger,
|
||||
get_session_summary,
|
||||
self_reflect,
|
||||
session_history,
|
||||
)
|
||||
|
||||
@@ -927,3 +928,155 @@ class TestSessionHistoryTool:
|
||||
|
||||
result = session_history("calculator")
|
||||
assert "tool:calculator" in result
|
||||
|
||||
|
||||
class TestSelfReflect:
    """Behavioural checks for the self_reflect() tool function."""

    @staticmethod
    def _message(role, content, timestamp, **extra):
        """Build a "message" log entry; *extra* carries optional fields."""
        return {
            "type": "message",
            "role": role,
            "content": content,
            **extra,
            "timestamp": timestamp,
        }

    @staticmethod
    def _install_logger(tmp_path, monkeypatch):
        """Point the module-level session logger at *tmp_path*."""
        session_logger = SessionLogger(logs_dir=tmp_path)
        monkeypatch.setattr("timmy.session_logger._session_logger", session_logger)
        return session_logger

    def _write_entries(self, logs_dir, entries):
        """Serialise *entries* as JSON lines into today's session file."""
        target = logs_dir / f"session_{date.today().isoformat()}.jsonl"
        payload = "".join(json.dumps(item) + "\n" for item in entries)
        target.write_text(payload)

    def test_self_reflect_no_history(self, tmp_path, mock_settings, monkeypatch):
        """An empty log directory yields the no-history notice."""
        self._install_logger(tmp_path, monkeypatch)

        report = self_reflect()

        assert "No conversation history" in report

    def test_self_reflect_detects_low_confidence(self, tmp_path, mock_settings, monkeypatch):
        """Only the response below the threshold is flagged."""
        self._install_logger(tmp_path, monkeypatch)
        hesitant = self._message(
            "timmy", "I think maybe it could be X", "2026-03-01T10:00:00", confidence=0.3
        )
        assured = self._message(
            "timmy", "The answer is Y", "2026-03-01T10:01:00", confidence=0.9
        )
        self._write_entries(tmp_path, [hesitant, assured])

        report = self_reflect()

        assert "Low-Confidence Responses (1)" in report
        assert "confidence=30%" in report

    def test_self_reflect_detects_errors(self, tmp_path, mock_settings, monkeypatch):
        """Logged errors are counted and quoted in the report."""
        self._install_logger(tmp_path, monkeypatch)
        failure = {
            "type": "error",
            "error": "Ollama connection refused",
            "timestamp": "2026-03-01T10:00:00",
        }
        greeting = self._message("user", "hello", "2026-03-01T10:01:00")
        self._write_entries(tmp_path, [failure, greeting])

        report = self_reflect()

        assert "Errors (1)" in report
        assert "Ollama connection refused" in report

    def test_self_reflect_detects_repeated_topics(self, tmp_path, mock_settings, monkeypatch):
        """A word used in three user messages shows up as a recurring topic."""
        self._install_logger(tmp_path, monkeypatch)
        prompts = [
            ("tell me about bitcoin", "2026-03-01T10:00:00"),
            ("more about bitcoin please", "2026-03-01T10:01:00"),
            ("bitcoin price today", "2026-03-01T10:02:00"),
        ]
        self._write_entries(
            tmp_path,
            [self._message("user", text, stamp) for text, stamp in prompts],
        )

        report = self_reflect()

        assert "Recurring Topics" in report
        assert "bitcoin" in report

    def test_self_reflect_healthy_session(self, tmp_path, mock_settings, monkeypatch):
        """With nothing to flag, the report ends on the healthy-session note."""
        self._install_logger(tmp_path, monkeypatch)
        exchange = [
            self._message("user", "hi", "2026-03-01T10:00:00"),
            self._message("timmy", "Hello!", "2026-03-01T10:01:00", confidence=0.9),
        ]
        self._write_entries(tmp_path, exchange)

        report = self_reflect()

        assert "Self-Reflection Report" in report
        assert "Keep up the good work" in report

    def test_self_reflect_includes_insights(self, tmp_path, mock_settings, monkeypatch):
        """A low-confidence answer produces a matching actionable insight."""
        self._install_logger(tmp_path, monkeypatch)
        unsure = self._message(
            "timmy", "I'm not sure about this", "2026-03-01T10:00:00", confidence=0.2
        )
        self._write_entries(tmp_path, [unsure])

        report = self_reflect()

        assert "Insights" in report
        assert "confidence was low" in report
|
||||
|
||||
Reference in New Issue
Block a user