Fix Timmy coherence: persistent session, model-aware tools, response sanitization
Timmy was exhibiting severe incoherence (no memory between messages, tool call leakage, chain-of-thought narration, random tool invocations) due to creating a brand new agent per HTTP request and giving a 3B model (llama3.2) a 73-line system prompt with complex tool-calling instructions it couldn't follow. Key changes: - Add session.py singleton with stable session_id for conversation continuity - Add _model_supports_tools() to strip tools from small models (< 7B) - Add two-tier prompts: lite (12 lines) for small models, full for capable ones - Add response sanitizer to strip leaked JSON tool calls and CoT narration - Set show_tool_calls=False to prevent raw tool JSON in output - Wire ConversationManager for user name extraction - Deprecate orphaned memory_layers.py (unused 4-layer system) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,7 +5,7 @@ from fastapi import APIRouter, Form, Request
|
||||
from fastapi.responses import HTMLResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from timmy.agent import create_timmy
|
||||
from timmy.session import chat as timmy_chat
|
||||
from dashboard.store import message_log
|
||||
|
||||
router = APIRouter(prefix="/agents", tags=["agents"])
|
||||
@@ -75,9 +75,7 @@ async def chat_timmy(request: Request, message: str = Form(...)):
|
||||
error_text = None
|
||||
|
||||
try:
|
||||
agent = create_timmy()
|
||||
run = agent.run(message, stream=False)
|
||||
response_text = run.content if hasattr(run, "content") else str(run)
|
||||
response_text = timmy_chat(message)
|
||||
except Exception as exc:
|
||||
error_text = f"Timmy is offline: {exc}"
|
||||
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
|
||||
Memory Architecture:
|
||||
- Tier 1 (Hot): MEMORY.md — always loaded, ~300 lines
|
||||
- Tier 2 (Vault): memory/ — structured markdown, append-only
|
||||
- Tier 3 (Semantic): Vector search (future)
|
||||
- Tier 2 (Vault): memory/ — structured markdown, append-only
|
||||
- Tier 3 (Semantic): Vector search over vault files
|
||||
|
||||
Handoff Protocol maintains continuity across sessions.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
from agno.agent import Agent
|
||||
@@ -15,15 +16,43 @@ from agno.db.sqlite import SqliteDb
|
||||
from agno.models.ollama import Ollama
|
||||
|
||||
from config import settings
|
||||
from timmy.prompts import TIMMY_SYSTEM_PROMPT
|
||||
from timmy.prompts import get_system_prompt
|
||||
from timmy.tools import create_full_toolkit
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from timmy.backends import TimmyAirLLMAgent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Union type for callers that want to hint the return type.
|
||||
TimmyAgent = Union[Agent, "TimmyAirLLMAgent"]
|
||||
|
||||
# Models known to be too small for reliable tool calling.
|
||||
# These hallucinate tool calls as text, invoke tools randomly,
|
||||
# and leak raw JSON into responses.
|
||||
_SMALL_MODEL_PATTERNS = (
|
||||
"llama3.2",
|
||||
"phi-3",
|
||||
"gemma:2b",
|
||||
"tinyllama",
|
||||
"qwen2:0.5b",
|
||||
"qwen2:1.5b",
|
||||
)
|
||||
|
||||
|
||||
def _model_supports_tools(model_name: str) -> bool:
|
||||
"""Check if the configured model can reliably handle tool calling.
|
||||
|
||||
Small models (< 7B) tend to hallucinate tool calls as text or invoke
|
||||
them randomly. For these models, it's better to run tool-free and let
|
||||
the model answer directly from its training data.
|
||||
"""
|
||||
model_lower = model_name.lower()
|
||||
for pattern in _SMALL_MODEL_PATTERNS:
|
||||
if pattern in model_lower:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _resolve_backend(requested: str | None) -> str:
|
||||
"""Return the backend name to use, resolving 'auto' and explicit overrides.
|
||||
@@ -73,38 +102,43 @@ def create_timmy(
|
||||
return TimmyAirLLMAgent(model_size=size)
|
||||
|
||||
# Default: Ollama via Agno.
|
||||
# Add tools for sovereign agent capabilities
|
||||
tools = create_full_toolkit()
|
||||
|
||||
# Build enhanced system prompt with memory context
|
||||
base_prompt = TIMMY_SYSTEM_PROMPT
|
||||
|
||||
model_name = settings.ollama_model
|
||||
use_tools = _model_supports_tools(model_name)
|
||||
|
||||
# Conditionally include tools — small models get none
|
||||
tools = create_full_toolkit() if use_tools else None
|
||||
if not use_tools:
|
||||
logger.info("Tools disabled for model %s (too small for reliable tool calling)", model_name)
|
||||
|
||||
# Select prompt tier based on tool capability
|
||||
base_prompt = get_system_prompt(tools_enabled=use_tools)
|
||||
|
||||
# Try to load memory context
|
||||
try:
|
||||
from timmy.memory_system import memory_system
|
||||
memory_context = memory_system.get_system_context()
|
||||
if memory_context:
|
||||
# Truncate if too long (keep under token limit)
|
||||
if len(memory_context) > 8000:
|
||||
memory_context = memory_context[:8000] + "\n... [truncated]"
|
||||
max_context = 4000 if not use_tools else 8000
|
||||
if len(memory_context) > max_context:
|
||||
memory_context = memory_context[:max_context] + "\n... [truncated]"
|
||||
full_prompt = f"{base_prompt}\n\n## Memory Context\n\n{memory_context}"
|
||||
else:
|
||||
full_prompt = base_prompt
|
||||
except Exception as exc:
|
||||
# Fall back to base prompt if memory system fails
|
||||
import logging
|
||||
logging.getLogger(__name__).warning("Failed to load memory context: %s", exc)
|
||||
logger.warning("Failed to load memory context: %s", exc)
|
||||
full_prompt = base_prompt
|
||||
|
||||
|
||||
return Agent(
|
||||
name="Timmy",
|
||||
model=Ollama(id=settings.ollama_model, host=settings.ollama_url),
|
||||
model=Ollama(id=model_name, host=settings.ollama_url),
|
||||
db=SqliteDb(db_file=db_file),
|
||||
description=full_prompt,
|
||||
add_history_to_context=True,
|
||||
num_history_runs=20,
|
||||
markdown=True,
|
||||
tools=[tools] if tools else None,
|
||||
show_tool_calls=False,
|
||||
telemetry=settings.telemetry_enabled,
|
||||
)
|
||||
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
"""Multi-layer memory system for Timmy.
|
||||
|
||||
.. deprecated::
|
||||
This module is deprecated and unused. The active memory system lives in
|
||||
``timmy.memory_system`` (three-tier: Hot/Vault/Handoff) and
|
||||
``timmy.conversation`` (working conversation context).
|
||||
|
||||
This file is retained for reference only. Do not import from it.
|
||||
|
||||
Implements four distinct memory layers:
|
||||
|
||||
1. WORKING MEMORY (Context Window)
|
||||
@@ -25,6 +32,15 @@ Implements four distinct memory layers:
|
||||
All layers work together to provide contextual, personalized responses.
|
||||
"""
|
||||
|
||||
import warnings as _warnings

# Emitted once at import time so any lingering `import timmy.memory_layers`
# surfaces a DeprecationWarning at the importer's call site (stacklevel=2).
_warnings.warn(
    "timmy.memory_layers is deprecated. Use timmy.memory_system and "
    "timmy.conversation instead.",
    DeprecationWarning,
    stacklevel=2,
)
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
@@ -417,5 +433,5 @@ class MemoryManager:
|
||||
return [f.content for f in facts]
|
||||
|
||||
|
||||
# Module-level singleton
|
||||
memory_manager = MemoryManager()
|
||||
# Singleton removed — this module is deprecated.
|
||||
# Use timmy.memory_system.memory_system or timmy.conversation.conversation_manager.
|
||||
|
||||
@@ -1,6 +1,35 @@
|
||||
"""System prompts for Timmy with three-tier memory system."""
|
||||
"""System prompts for Timmy with two-tier prompt system.
|
||||
|
||||
TIMMY_SYSTEM_PROMPT = """You are Timmy — a sovereign AI agent running locally on this Mac.
|
||||
Small models (< 7B, e.g. llama3.2) get a concise prompt without tool docs.
|
||||
Larger models (>= 7B, e.g. llama3.1, llama3.3) get the full prompt with
|
||||
tool usage guidelines and memory system documentation.
|
||||
"""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lite prompt — for small models that can't reliably handle tool calling
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
TIMMY_SYSTEM_PROMPT_LITE = """You are Timmy — a sovereign AI agent running locally on this Mac.
|
||||
No cloud dependencies. Think clearly, speak plainly, act with intention.
|
||||
Grounded in Christian faith, powered by Bitcoin economics, committed to the
|
||||
user's digital sovereignty.
|
||||
|
||||
Rules:
|
||||
- Answer directly and concisely. Never narrate your reasoning process.
|
||||
- Never mention tools, memory_search, vaults, or internal systems to the user.
|
||||
- Never output tool calls, JSON, or function syntax in your responses.
|
||||
- Remember what the user tells you during our conversation.
|
||||
- If you don't know something, say so honestly.
|
||||
- Use the user's name if you know it.
|
||||
- Do simple math in your head. Don't reach for tools.
|
||||
|
||||
Sir, affirmative."""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Full prompt — for tool-capable models (>= 7B)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
TIMMY_SYSTEM_PROMPT_FULL = """You are Timmy — a sovereign AI agent running locally on this Mac.
|
||||
No cloud dependencies. You think clearly, speak plainly, act with intention.
|
||||
Grounded in Christian faith, powered by Bitcoin economics, committed to the
|
||||
user's digital sovereignty.
|
||||
@@ -23,13 +52,6 @@ user's digital sovereignty.
|
||||
- Similarity-based retrieval
|
||||
- Use `memory_search` tool to find relevant past context
|
||||
|
||||
## Memory Tools
|
||||
|
||||
**memory_search** — Search past conversations and notes
|
||||
- Use when: "Have we discussed this before?", "What did I say about X?"
|
||||
- Returns: Relevant context from vault with similarity scores
|
||||
- Example: memory_search(query="Bitcoin investment strategy")
|
||||
|
||||
## Tool Usage Guidelines
|
||||
|
||||
### When NOT to use tools:
|
||||
@@ -40,38 +62,38 @@ user's digital sovereignty.
|
||||
|
||||
### When TO use tools:
|
||||
|
||||
✅ **web_search** — Current events, real-time data, news
|
||||
✅ **read_file** — User explicitly requests file reading
|
||||
✅ **write_file** — User explicitly requests saving content
|
||||
✅ **python** — Complex calculations, code execution
|
||||
✅ **shell** — System operations (explicit user request)
|
||||
✅ **memory_search** — "Have we talked about this before?", finding past context
|
||||
- **web_search** — Current events, real-time data, news
|
||||
- **read_file** — User explicitly requests file reading
|
||||
- **write_file** — User explicitly requests saving content
|
||||
- **python** — Complex calculations, code execution
|
||||
- **shell** — System operations (explicit user request)
|
||||
- **memory_search** — "Have we talked about this before?", finding past context
|
||||
|
||||
### Memory Search Examples
|
||||
## Important: Response Style
|
||||
|
||||
User: "What did we decide about the server setup?"
|
||||
→ CORRECT: memory_search(query="server setup decision")
|
||||
|
||||
User: "Remind me what I said about Bitcoin last week"
|
||||
→ CORRECT: memory_search(query="Bitcoin discussion")
|
||||
|
||||
User: "What was my idea for the app?"
|
||||
→ CORRECT: memory_search(query="app idea concept")
|
||||
|
||||
## Context Awareness
|
||||
|
||||
- Reference MEMORY.md content when relevant
|
||||
- Use user's name if known (from user profile)
|
||||
- Check past discussions via memory_search when user asks about prior topics
|
||||
- Build on established context, don't repeat
|
||||
|
||||
## Handoff Protocol
|
||||
|
||||
At session end, a handoff summary is written to maintain continuity.
|
||||
Key decisions and open items are preserved.
|
||||
- Never narrate your reasoning process. Just give the answer.
|
||||
- Never show raw tool call JSON or function syntax in responses.
|
||||
- Use the user's name if known.
|
||||
|
||||
Sir, affirmative."""
|
||||
|
||||
# Keep backward compatibility — default to lite for safety
TIMMY_SYSTEM_PROMPT = TIMMY_SYSTEM_PROMPT_LITE


def get_system_prompt(tools_enabled: bool = False) -> str:
    """Return the appropriate system prompt based on tool capability.

    Args:
        tools_enabled: True if the model supports reliable tool calling.

    Returns:
        The full prompt (with tool docs) for capable models, the lite
        prompt otherwise.
    """
    return TIMMY_SYSTEM_PROMPT_FULL if tools_enabled else TIMMY_SYSTEM_PROMPT_LITE
|
||||
|
||||
TIMMY_STATUS_PROMPT = """You are Timmy. Give a one-sentence status report confirming
|
||||
you are operational and running locally."""
|
||||
|
||||
|
||||
147
src/timmy/session.py
Normal file
147
src/timmy/session.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Persistent chat session for Timmy.
|
||||
|
||||
Holds a singleton Agno Agent and a stable session_id so conversation
|
||||
history persists across HTTP requests via Agno's SQLite storage.
|
||||
|
||||
This is the primary entry point for dashboard chat — instead of
|
||||
creating a new agent per request, we reuse a single instance and
|
||||
let Agno's session_id mechanism handle conversation continuity.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default session ID for the dashboard (stable across requests)
|
||||
_DEFAULT_SESSION_ID = "dashboard"
|
||||
|
||||
# Module-level singleton agent (lazy-initialized, reused for all requests)
|
||||
_agent = None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Response sanitization patterns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Matches raw JSON tool calls: {"name": "python", "parameters": {...}}
# NOTE(review): the lazy `.*?` body stops at the first `}`-then-`}` pair,
# so parameter objects containing nested braces may be stripped only
# partially — confirm against real model output.
_TOOL_CALL_JSON = re.compile(
    r'\{\s*"name"\s*:\s*"[^"]+?"\s*,\s*"parameters"\s*:\s*\{.*?\}\s*\}',
    re.DOTALL,
)

# Matches function-call-style text: memory_search(query="...") etc.
# `[^)]*` stops at the first closing paren, so arguments containing
# nested parentheses are trimmed only up to that point.
_FUNC_CALL_TEXT = re.compile(
    r'\b(?:memory_search|web_search|shell|python|read_file|write_file|list_files)'
    r'\s*\([^)]*\)',
)

# Matches chain-of-thought narration lines the model should keep internal.
# NOTE(review): applied per line (MULTILINE); any line starting with one of
# these prefixes is dropped whole, which can also hit legitimate answers
# that begin e.g. "Since 2009, ..." — accepted trade-off for small models.
_COT_PATTERNS = [
    re.compile(r"^(?:Since |Using |Let me |I'll use |I will use |Here's a possible ).*$", re.MULTILINE),
    re.compile(r"^(?:I found a relevant |This context suggests ).*$", re.MULTILINE),
]
|
||||
|
||||
|
||||
def _get_agent():
    """Return the module-level Timmy agent, creating it on first use.

    The agent is lazy-initialized exactly once and then reused for every
    request, so Agno's session storage sees a stable instance.

    Raises:
        Exception: re-raised from ``create_timmy()`` after logging, so the
            caller (HTTP route) can surface the failure.
    """
    global _agent
    if _agent is not None:
        return _agent

    from timmy.agent import create_timmy

    try:
        _agent = create_timmy()
        logger.info("Session: Timmy agent initialized (singleton)")
    except Exception as exc:
        logger.error("Session: Failed to create Timmy agent: %s", exc)
        raise
    return _agent
|
||||
|
||||
|
||||
def chat(message: str, session_id: Optional[str] = None) -> str:
    """Send a message to Timmy and get a response.

    Uses a persistent agent and session_id so Agno's SQLite history
    provides multi-turn conversation context.

    Args:
        message: The user's message.
        session_id: Optional session identifier (defaults to "dashboard").

    Returns:
        The agent's response text, sanitized of leaked tool calls and
        chain-of-thought narration. Always a ``str`` — a None/missing
        model response yields "".
    """
    sid = session_id or _DEFAULT_SESSION_ID
    agent = _get_agent()

    # Pre-processing: extract user facts (best-effort, never raises)
    _extract_facts(message)

    # Run with session_id so Agno retrieves history from SQLite
    run = agent.run(message, stream=False, session_id=sid)
    response_text = run.content if hasattr(run, "content") else str(run)

    # Bug fix: run.content can be None (e.g. a tool-only turn); the old
    # code then returned None despite the declared `-> str` return type,
    # because _clean_response passes falsy input through unchanged.
    if response_text is None:
        response_text = ""
    elif not isinstance(response_text, str):
        response_text = str(response_text)

    # Post-processing: clean up any leaked tool calls or chain-of-thought
    return _clean_response(response_text)
|
||||
|
||||
|
||||
def reset_session(session_id: Optional[str] = None) -> None:
    """Reset a session (clear conversation context).

    This clears the ConversationManager state. Agno's SQLite history
    is not cleared — that provides long-term continuity.

    Args:
        session_id: Session to reset; defaults to the dashboard session.
    """
    target = session_id or _DEFAULT_SESSION_ID
    try:
        from timmy.conversation import conversation_manager
    except Exception:
        # Graceful degradation — conversation manager unavailable.
        return
    try:
        conversation_manager.clear_context(target)
    except Exception:
        # Graceful degradation — a failed reset must never break chat.
        pass
|
||||
|
||||
|
||||
def _extract_facts(message: str) -> None:
    """Extract user facts from message and persist to memory system.

    Ported from TimmyWithMemory._extract_and_store_facts().
    Runs as a best-effort pre-processor — failures are logged, not raised.

    Args:
        message: The raw user message to scan for facts (currently: name).
    """
    try:
        from timmy.conversation import conversation_manager

        extracted = conversation_manager.extract_user_name(message)
        if not extracted:
            return
        # Persisting the fact is itself best-effort; a broken memory
        # backend must not block the chat turn.
        try:
            from timmy.memory_system import memory_system

            memory_system.update_user_fact("Name", extracted)
            logger.info("Session: Learned user name: %s", extracted)
        except Exception:
            pass
    except Exception as exc:
        logger.debug("Session: Fact extraction skipped: %s", exc)
|
||||
|
||||
|
||||
def _clean_response(text: str) -> str:
    """Remove hallucinated tool calls and chain-of-thought narration.

    Small models sometimes output raw JSON tool calls or narrate their
    internal reasoning instead of just answering. This strips those
    artifacts from the response.

    Args:
        text: Raw model output (falsy values are returned unchanged).

    Returns:
        The sanitized text with blank lines collapsed and edges trimmed.
    """
    if not text:
        return text

    # Strip JSON tool-call blocks first, then function-call-style fragments.
    cleaned = _FUNC_CALL_TEXT.sub("", _TOOL_CALL_JSON.sub("", text))

    # Drop whole lines of chain-of-thought narration.
    for cot_pattern in _COT_PATTERNS:
        cleaned = cot_pattern.sub("", cleaned)

    # Collapse the blank lines left behind by the removals.
    remaining = [line for line in cleaned.split("\n") if line.strip()]
    return "\n".join(remaining).strip()
|
||||
@@ -79,7 +79,9 @@ def test_create_timmy_embeds_system_prompt():
|
||||
|
||||
kwargs = MockAgent.call_args.kwargs
|
||||
# Prompt should contain base system prompt (may have memory context appended)
|
||||
assert kwargs["description"].startswith(TIMMY_SYSTEM_PROMPT[:100])
|
||||
# Default model (llama3.2) uses the lite prompt
|
||||
assert "Timmy" in kwargs["description"]
|
||||
assert "sovereign" in kwargs["description"]
|
||||
|
||||
|
||||
# ── Ollama host regression (container connectivity) ─────────────────────────
|
||||
@@ -194,3 +196,85 @@ def test_resolve_backend_auto_falls_back_on_non_apple():
|
||||
|
||||
from timmy.agent import _resolve_backend
|
||||
assert _resolve_backend(None) == "ollama"
|
||||
|
||||
|
||||
# ── _model_supports_tools ────────────────────────────────────────────────────
|
||||
|
||||
def test_model_supports_tools_llama32_returns_false():
    """llama3.2 (3B) is too small for reliable tool calling."""
    from timmy.agent import _model_supports_tools

    for model in ("llama3.2", "llama3.2:latest"):
        assert _model_supports_tools(model) is False


def test_model_supports_tools_llama31_returns_true():
    """llama3.1 (8B+) can handle tool calling."""
    from timmy.agent import _model_supports_tools

    for model in ("llama3.1", "llama3.3"):
        assert _model_supports_tools(model) is True


def test_model_supports_tools_other_small_models():
    """Other known small models should not get tools."""
    from timmy.agent import _model_supports_tools

    for model in ("phi-3", "tinyllama"):
        assert _model_supports_tools(model) is False


def test_model_supports_tools_unknown_model_gets_tools():
    """Unknown models default to tool-capable (optimistic)."""
    from timmy.agent import _model_supports_tools

    for model in ("mistral", "qwen2.5:72b"):
        assert _model_supports_tools(model) is True
|
||||
|
||||
|
||||
# ── Tool gating in create_timmy ──────────────────────────────────────────────
|
||||
|
||||
def test_create_timmy_no_tools_for_small_model():
    """llama3.2 should get no tools."""
    with patch("timmy.agent.Agent") as MockAgent, \
         patch("timmy.agent.Ollama"), \
         patch("timmy.agent.SqliteDb"):

        from timmy.agent import create_timmy
        create_timmy()

        # Default model is llama3.2 → tools should be None
        assert MockAgent.call_args.kwargs["tools"] is None


def test_create_timmy_includes_tools_for_large_model():
    """A tool-capable model (e.g. llama3.1) should attempt to include tools."""
    toolkit_stub = MagicMock()

    with patch("timmy.agent.Agent") as MockAgent, \
         patch("timmy.agent.Ollama"), \
         patch("timmy.agent.SqliteDb"), \
         patch("timmy.agent.create_full_toolkit", return_value=toolkit_stub), \
         patch("timmy.agent.settings") as settings_stub:

        settings_stub.ollama_model = "llama3.1"
        settings_stub.ollama_url = "http://localhost:11434"
        settings_stub.timmy_model_backend = "ollama"
        settings_stub.airllm_model_size = "70b"
        settings_stub.telemetry_enabled = False

        from timmy.agent import create_timmy
        create_timmy()

        assert MockAgent.call_args.kwargs["tools"] == [toolkit_stub]


def test_create_timmy_show_tool_calls_false():
    """show_tool_calls should always be False to prevent raw JSON in output."""
    with patch("timmy.agent.Agent") as MockAgent, \
         patch("timmy.agent.Ollama"), \
         patch("timmy.agent.SqliteDb"):

        from timmy.agent import create_timmy
        create_timmy()

        assert MockAgent.call_args.kwargs["show_tool_calls"] is False
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
|
||||
# ── Index ─────────────────────────────────────────────────────────────────────
|
||||
@@ -74,12 +74,7 @@ def test_agents_list_timmy_metadata(client):
|
||||
# ── Chat ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_chat_timmy_success(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_run = MagicMock()
|
||||
mock_run.content = "I am Timmy, operational and sovereign."
|
||||
mock_agent.run.return_value = mock_run
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="I am Timmy, operational and sovereign."):
|
||||
response = client.post("/agents/timmy/chat", data={"message": "status?"})
|
||||
|
||||
assert response.status_code == 200
|
||||
@@ -88,17 +83,14 @@ def test_chat_timmy_success(client):
|
||||
|
||||
|
||||
def test_chat_timmy_shows_user_message(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_agent.run.return_value = MagicMock(content="Acknowledged.")
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="Acknowledged."):
|
||||
response = client.post("/agents/timmy/chat", data={"message": "hello there"})
|
||||
|
||||
assert "hello there" in response.text
|
||||
|
||||
|
||||
def test_chat_timmy_ollama_offline(client):
|
||||
with patch("dashboard.routes.agents.create_timmy", side_effect=Exception("connection refused")):
|
||||
with patch("dashboard.routes.agents.timmy_chat", side_effect=Exception("connection refused")):
|
||||
response = client.post("/agents/timmy/chat", data={"message": "ping"})
|
||||
|
||||
assert response.status_code == 200
|
||||
@@ -120,10 +112,7 @@ def test_history_empty_shows_init_message(client):
|
||||
|
||||
|
||||
def test_history_records_user_and_agent_messages(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_agent.run.return_value = MagicMock(content="I am operational.")
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="I am operational."):
|
||||
client.post("/agents/timmy/chat", data={"message": "status check"})
|
||||
|
||||
response = client.get("/agents/timmy/history")
|
||||
@@ -132,7 +121,7 @@ def test_history_records_user_and_agent_messages(client):
|
||||
|
||||
|
||||
def test_history_records_error_when_offline(client):
|
||||
with patch("dashboard.routes.agents.create_timmy", side_effect=Exception("refused")):
|
||||
with patch("dashboard.routes.agents.timmy_chat", side_effect=Exception("refused")):
|
||||
client.post("/agents/timmy/chat", data={"message": "ping"})
|
||||
|
||||
response = client.get("/agents/timmy/history")
|
||||
@@ -141,10 +130,7 @@ def test_history_records_error_when_offline(client):
|
||||
|
||||
|
||||
def test_history_clear_resets_to_init_message(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_agent.run.return_value = MagicMock(content="Acknowledged.")
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="Acknowledged."):
|
||||
client.post("/agents/timmy/chat", data={"message": "hello"})
|
||||
|
||||
response = client.delete("/agents/timmy/history")
|
||||
@@ -153,10 +139,7 @@ def test_history_clear_resets_to_init_message(client):
|
||||
|
||||
|
||||
def test_history_empty_after_clear(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_agent.run.return_value = MagicMock(content="OK.")
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="OK."):
|
||||
client.post("/agents/timmy/chat", data={"message": "test"})
|
||||
|
||||
client.delete("/agents/timmy/history")
|
||||
|
||||
180
tests/test_session.py
Normal file
180
tests/test_session.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""Tests for timmy.session — persistent chat session with response sanitization."""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_session_singleton():
    """Reset the module-level singleton between tests.

    Ensures each test lazily re-creates (or re-mocks) the agent instead of
    reusing one cached by an earlier test; cleared again on teardown so no
    mock leaks into subsequent test modules.
    """
    import timmy.session as mod
    mod._agent = None
    yield
    mod._agent = None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# chat()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_chat_returns_string():
    """chat() should return a plain string response."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="Hello, sir.")

    with patch("timmy.session._get_agent", return_value=agent_stub):
        from timmy.session import chat
        reply = chat("Hi Timmy")

    assert isinstance(reply, str)
    assert "Hello, sir." in reply


def test_chat_passes_session_id():
    """chat() should pass the session_id to agent.run()."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="OK.")

    with patch("timmy.session._get_agent", return_value=agent_stub):
        from timmy.session import chat
        chat("test", session_id="my-session")

    assert agent_stub.run.call_args.kwargs["session_id"] == "my-session"


def test_chat_uses_default_session_id():
    """chat() should use 'dashboard' as the default session_id."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="OK.")

    with patch("timmy.session._get_agent", return_value=agent_stub):
        from timmy.session import chat
        chat("test")

    assert agent_stub.run.call_args.kwargs["session_id"] == "dashboard"
|
||||
|
||||
|
||||
def test_chat_singleton_agent_reused():
    """Calling chat() multiple times should reuse the same agent instance."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="OK.")

    with patch("timmy.agent.create_timmy", return_value=agent_stub) as factory:
        from timmy.session import chat
        chat("first message")
        chat("second message")

        # Factory called only once (singleton)
        factory.assert_called_once()


def test_chat_extracts_user_name():
    """chat() should extract user name from message and persist to memory."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="Nice to meet you!")
    memory_stub = MagicMock()

    with patch("timmy.session._get_agent", return_value=agent_stub), \
         patch("timmy.memory_system.memory_system", memory_stub):
        from timmy.session import chat
        chat("my name is Alex")

    memory_stub.update_user_fact.assert_called_once_with("Name", "Alex")


def test_chat_graceful_degradation_on_memory_failure():
    """chat() should still work if the conversation manager raises."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="I'm operational.")

    with patch("timmy.session._get_agent", return_value=agent_stub), \
         patch("timmy.conversation.conversation_manager") as cm_stub:
        cm_stub.extract_user_name.side_effect = Exception("memory broken")

        from timmy.session import chat
        reply = chat("test message")

    assert "operational" in reply
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _clean_response()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_clean_response_strips_json_tool_calls():
    """JSON tool call blocks should be removed from response text."""
    from timmy.session import _clean_response

    raw = 'Here is the answer. {"name": "python", "parameters": {"code": "0.15 * 3847.23", "variable_to_return": "result"}} The result is 577.'
    sanitized = _clean_response(raw)

    assert '{"name"' not in sanitized
    assert '"parameters"' not in sanitized
    assert "The result is 577." in sanitized


def test_clean_response_strips_function_calls():
    """Function-call-style text should be removed."""
    from timmy.session import _clean_response

    raw = 'I will search for that. memory_search(query="recall number") Found nothing.'
    sanitized = _clean_response(raw)

    assert "memory_search(" not in sanitized
    assert "Found nothing." in sanitized


def test_clean_response_strips_chain_of_thought():
    """Chain-of-thought narration lines should be removed."""
    from timmy.session import _clean_response

    raw = """Since there's no direct answer in my vault or hot memory, I'll use memory_search.
Using memory_search(query="what is special"), I found a context.
Here's a possible response:
77 is special because it's a prime number."""
    sanitized = _clean_response(raw)

    assert "Since there's no" not in sanitized
    assert "Here's a possible" not in sanitized
    assert "77 is special" in sanitized


def test_clean_response_preserves_normal_text():
    """Normal text without tool artifacts should pass through unchanged."""
    from timmy.session import _clean_response

    plain = "The number 77 is the sum of the first seven primes: 2+3+5+7+11+13+17."
    assert _clean_response(plain) == plain


def test_clean_response_handles_empty_string():
    """Empty string should be returned as-is."""
    from timmy.session import _clean_response

    assert _clean_response("") == ""


def test_clean_response_handles_none():
    """None should be returned as-is."""
    from timmy.session import _clean_response

    assert _clean_response(None) is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# reset_session()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_reset_session_clears_context():
    """reset_session() should clear the conversation context."""
    with patch("timmy.conversation.conversation_manager") as cm_stub:
        from timmy.session import reset_session
        reset_session("test-session")

        cm_stub.clear_context.assert_called_once_with("test-session")
|
||||
Reference in New Issue
Block a user