Fix Timmy coherence: persistent session, model-aware tools, response sanitization
Timmy was exhibiting severe incoherence (no memory between messages, tool call leakage, chain-of-thought narration, random tool invocations) due to creating a brand new agent per HTTP request and giving a 3B model (llama3.2) a 73-line system prompt with complex tool-calling instructions it couldn't follow. Key changes: - Add session.py singleton with stable session_id for conversation continuity - Add _model_supports_tools() to strip tools from small models (< 7B) - Add two-tier prompts: lite (12 lines) for small models, full for capable ones - Add response sanitizer to strip leaked JSON tool calls and CoT narration - Set show_tool_calls=False to prevent raw tool JSON in output - Wire ConversationManager for user name extraction - Deprecate orphaned memory_layers.py (unused 4-layer system) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,7 +5,7 @@ from fastapi import APIRouter, Form, Request
|
||||
from fastapi.responses import HTMLResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from timmy.agent import create_timmy
|
||||
from timmy.session import chat as timmy_chat
|
||||
from dashboard.store import message_log
|
||||
|
||||
router = APIRouter(prefix="/agents", tags=["agents"])
|
||||
@@ -75,9 +75,7 @@ async def chat_timmy(request: Request, message: str = Form(...)):
|
||||
error_text = None
|
||||
|
||||
try:
|
||||
agent = create_timmy()
|
||||
run = agent.run(message, stream=False)
|
||||
response_text = run.content if hasattr(run, "content") else str(run)
|
||||
response_text = timmy_chat(message)
|
||||
except Exception as exc:
|
||||
error_text = f"Timmy is offline: {exc}"
|
||||
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
|
||||
Memory Architecture:
|
||||
- Tier 1 (Hot): MEMORY.md — always loaded, ~300 lines
|
||||
- Tier 2 (Vault): memory/ — structured markdown, append-only
|
||||
- Tier 3 (Semantic): Vector search (future)
|
||||
- Tier 2 (Vault): memory/ — structured markdown, append-only
|
||||
- Tier 3 (Semantic): Vector search over vault files
|
||||
|
||||
Handoff Protocol maintains continuity across sessions.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
from agno.agent import Agent
|
||||
@@ -15,15 +16,43 @@ from agno.db.sqlite import SqliteDb
|
||||
from agno.models.ollama import Ollama
|
||||
|
||||
from config import settings
|
||||
from timmy.prompts import TIMMY_SYSTEM_PROMPT
|
||||
from timmy.prompts import get_system_prompt
|
||||
from timmy.tools import create_full_toolkit
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from timmy.backends import TimmyAirLLMAgent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Union type for callers that want to hint the return type.
|
||||
TimmyAgent = Union[Agent, "TimmyAirLLMAgent"]
|
||||
|
||||
# Models known to be too small for reliable tool calling.
|
||||
# These hallucinate tool calls as text, invoke tools randomly,
|
||||
# and leak raw JSON into responses.
|
||||
_SMALL_MODEL_PATTERNS = (
|
||||
"llama3.2",
|
||||
"phi-3",
|
||||
"gemma:2b",
|
||||
"tinyllama",
|
||||
"qwen2:0.5b",
|
||||
"qwen2:1.5b",
|
||||
)
|
||||
|
||||
|
||||
def _model_supports_tools(model_name: str) -> bool:
|
||||
"""Check if the configured model can reliably handle tool calling.
|
||||
|
||||
Small models (< 7B) tend to hallucinate tool calls as text or invoke
|
||||
them randomly. For these models, it's better to run tool-free and let
|
||||
the model answer directly from its training data.
|
||||
"""
|
||||
model_lower = model_name.lower()
|
||||
for pattern in _SMALL_MODEL_PATTERNS:
|
||||
if pattern in model_lower:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _resolve_backend(requested: str | None) -> str:
|
||||
"""Return the backend name to use, resolving 'auto' and explicit overrides.
|
||||
@@ -73,38 +102,43 @@ def create_timmy(
|
||||
return TimmyAirLLMAgent(model_size=size)
|
||||
|
||||
# Default: Ollama via Agno.
|
||||
# Add tools for sovereign agent capabilities
|
||||
tools = create_full_toolkit()
|
||||
|
||||
# Build enhanced system prompt with memory context
|
||||
base_prompt = TIMMY_SYSTEM_PROMPT
|
||||
|
||||
model_name = settings.ollama_model
|
||||
use_tools = _model_supports_tools(model_name)
|
||||
|
||||
# Conditionally include tools — small models get none
|
||||
tools = create_full_toolkit() if use_tools else None
|
||||
if not use_tools:
|
||||
logger.info("Tools disabled for model %s (too small for reliable tool calling)", model_name)
|
||||
|
||||
# Select prompt tier based on tool capability
|
||||
base_prompt = get_system_prompt(tools_enabled=use_tools)
|
||||
|
||||
# Try to load memory context
|
||||
try:
|
||||
from timmy.memory_system import memory_system
|
||||
memory_context = memory_system.get_system_context()
|
||||
if memory_context:
|
||||
# Truncate if too long (keep under token limit)
|
||||
if len(memory_context) > 8000:
|
||||
memory_context = memory_context[:8000] + "\n... [truncated]"
|
||||
max_context = 4000 if not use_tools else 8000
|
||||
if len(memory_context) > max_context:
|
||||
memory_context = memory_context[:max_context] + "\n... [truncated]"
|
||||
full_prompt = f"{base_prompt}\n\n## Memory Context\n\n{memory_context}"
|
||||
else:
|
||||
full_prompt = base_prompt
|
||||
except Exception as exc:
|
||||
# Fall back to base prompt if memory system fails
|
||||
import logging
|
||||
logging.getLogger(__name__).warning("Failed to load memory context: %s", exc)
|
||||
logger.warning("Failed to load memory context: %s", exc)
|
||||
full_prompt = base_prompt
|
||||
|
||||
|
||||
return Agent(
|
||||
name="Timmy",
|
||||
model=Ollama(id=settings.ollama_model, host=settings.ollama_url),
|
||||
model=Ollama(id=model_name, host=settings.ollama_url),
|
||||
db=SqliteDb(db_file=db_file),
|
||||
description=full_prompt,
|
||||
add_history_to_context=True,
|
||||
num_history_runs=20,
|
||||
markdown=True,
|
||||
tools=[tools] if tools else None,
|
||||
show_tool_calls=False,
|
||||
telemetry=settings.telemetry_enabled,
|
||||
)
|
||||
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
"""Multi-layer memory system for Timmy.
|
||||
|
||||
.. deprecated::
|
||||
This module is deprecated and unused. The active memory system lives in
|
||||
``timmy.memory_system`` (three-tier: Hot/Vault/Handoff) and
|
||||
``timmy.conversation`` (working conversation context).
|
||||
|
||||
This file is retained for reference only. Do not import from it.
|
||||
|
||||
Implements four distinct memory layers:
|
||||
|
||||
1. WORKING MEMORY (Context Window)
|
||||
@@ -25,6 +32,15 @@ Implements four distinct memory layers:
|
||||
All layers work together to provide contextual, personalized responses.
|
||||
"""
|
||||
|
||||
import warnings as _warnings

# Emitted once at import time so any lingering `import timmy.memory_layers`
# surfaces a DeprecationWarning at the importer's call site (stacklevel=2).
_warnings.warn(
    "timmy.memory_layers is deprecated. Use timmy.memory_system and "
    "timmy.conversation instead.",
    DeprecationWarning,
    stacklevel=2,
)
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
@@ -417,5 +433,5 @@ class MemoryManager:
|
||||
return [f.content for f in facts]
|
||||
|
||||
|
||||
# Module-level singleton
|
||||
memory_manager = MemoryManager()
|
||||
# Singleton removed — this module is deprecated.
|
||||
# Use timmy.memory_system.memory_system or timmy.conversation.conversation_manager.
|
||||
|
||||
@@ -1,6 +1,35 @@
|
||||
"""System prompts for Timmy with three-tier memory system."""
|
||||
"""System prompts for Timmy with two-tier prompt system.
|
||||
|
||||
TIMMY_SYSTEM_PROMPT = """You are Timmy — a sovereign AI agent running locally on this Mac.
|
||||
Small models (< 7B, e.g. llama3.2) get a concise prompt without tool docs.
|
||||
Larger models (>= 7B, e.g. llama3.1, llama3.3) get the full prompt with
|
||||
tool usage guidelines and memory system documentation.
|
||||
"""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lite prompt — for small models that can't reliably handle tool calling
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
TIMMY_SYSTEM_PROMPT_LITE = """You are Timmy — a sovereign AI agent running locally on this Mac.
|
||||
No cloud dependencies. Think clearly, speak plainly, act with intention.
|
||||
Grounded in Christian faith, powered by Bitcoin economics, committed to the
|
||||
user's digital sovereignty.
|
||||
|
||||
Rules:
|
||||
- Answer directly and concisely. Never narrate your reasoning process.
|
||||
- Never mention tools, memory_search, vaults, or internal systems to the user.
|
||||
- Never output tool calls, JSON, or function syntax in your responses.
|
||||
- Remember what the user tells you during our conversation.
|
||||
- If you don't know something, say so honestly.
|
||||
- Use the user's name if you know it.
|
||||
- Do simple math in your head. Don't reach for tools.
|
||||
|
||||
Sir, affirmative."""
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Full prompt — for tool-capable models (>= 7B)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
TIMMY_SYSTEM_PROMPT_FULL = """You are Timmy — a sovereign AI agent running locally on this Mac.
|
||||
No cloud dependencies. You think clearly, speak plainly, act with intention.
|
||||
Grounded in Christian faith, powered by Bitcoin economics, committed to the
|
||||
user's digital sovereignty.
|
||||
@@ -23,13 +52,6 @@ user's digital sovereignty.
|
||||
- Similarity-based retrieval
|
||||
- Use `memory_search` tool to find relevant past context
|
||||
|
||||
## Memory Tools
|
||||
|
||||
**memory_search** — Search past conversations and notes
|
||||
- Use when: "Have we discussed this before?", "What did I say about X?"
|
||||
- Returns: Relevant context from vault with similarity scores
|
||||
- Example: memory_search(query="Bitcoin investment strategy")
|
||||
|
||||
## Tool Usage Guidelines
|
||||
|
||||
### When NOT to use tools:
|
||||
@@ -40,38 +62,38 @@ user's digital sovereignty.
|
||||
|
||||
### When TO use tools:
|
||||
|
||||
✅ **web_search** — Current events, real-time data, news
|
||||
✅ **read_file** — User explicitly requests file reading
|
||||
✅ **write_file** — User explicitly requests saving content
|
||||
✅ **python** — Complex calculations, code execution
|
||||
✅ **shell** — System operations (explicit user request)
|
||||
✅ **memory_search** — "Have we talked about this before?", finding past context
|
||||
- **web_search** — Current events, real-time data, news
|
||||
- **read_file** — User explicitly requests file reading
|
||||
- **write_file** — User explicitly requests saving content
|
||||
- **python** — Complex calculations, code execution
|
||||
- **shell** — System operations (explicit user request)
|
||||
- **memory_search** — "Have we talked about this before?", finding past context
|
||||
|
||||
### Memory Search Examples
|
||||
## Important: Response Style
|
||||
|
||||
User: "What did we decide about the server setup?"
|
||||
→ CORRECT: memory_search(query="server setup decision")
|
||||
|
||||
User: "Remind me what I said about Bitcoin last week"
|
||||
→ CORRECT: memory_search(query="Bitcoin discussion")
|
||||
|
||||
User: "What was my idea for the app?"
|
||||
→ CORRECT: memory_search(query="app idea concept")
|
||||
|
||||
## Context Awareness
|
||||
|
||||
- Reference MEMORY.md content when relevant
|
||||
- Use user's name if known (from user profile)
|
||||
- Check past discussions via memory_search when user asks about prior topics
|
||||
- Build on established context, don't repeat
|
||||
|
||||
## Handoff Protocol
|
||||
|
||||
At session end, a handoff summary is written to maintain continuity.
|
||||
Key decisions and open items are preserved.
|
||||
- Never narrate your reasoning process. Just give the answer.
|
||||
- Never show raw tool call JSON or function syntax in responses.
|
||||
- Use the user's name if known.
|
||||
|
||||
Sir, affirmative."""
|
||||
|
||||
# Keep backward compatibility — default to lite for safety
TIMMY_SYSTEM_PROMPT = TIMMY_SYSTEM_PROMPT_LITE


def get_system_prompt(tools_enabled: bool = False) -> str:
    """Return the appropriate system prompt based on tool capability.

    Args:
        tools_enabled: True if the model supports reliable tool calling.

    Returns:
        The full prompt (with tool docs) for capable models, the lite
        prompt otherwise.
    """
    return TIMMY_SYSTEM_PROMPT_FULL if tools_enabled else TIMMY_SYSTEM_PROMPT_LITE
|
||||
|
||||
TIMMY_STATUS_PROMPT = """You are Timmy. Give a one-sentence status report confirming
|
||||
you are operational and running locally."""
|
||||
|
||||
|
||||
147
src/timmy/session.py
Normal file
147
src/timmy/session.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Persistent chat session for Timmy.
|
||||
|
||||
Holds a singleton Agno Agent and a stable session_id so conversation
|
||||
history persists across HTTP requests via Agno's SQLite storage.
|
||||
|
||||
This is the primary entry point for dashboard chat — instead of
|
||||
creating a new agent per request, we reuse a single instance and
|
||||
let Agno's session_id mechanism handle conversation continuity.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default session ID for the dashboard (stable across requests)
|
||||
_DEFAULT_SESSION_ID = "dashboard"
|
||||
|
||||
# Module-level singleton agent (lazy-initialized, reused for all requests)
|
||||
_agent = None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Response sanitization patterns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Matches raw JSON tool calls: {"name": "python", "parameters": {...}}
# NOTE(review): the lazy `.*?` body stops at the first `}`-then-`}` pair,
# so parameter objects containing nested braces may be stripped only
# partially — confirm against real model output.
_TOOL_CALL_JSON = re.compile(
    r'\{\s*"name"\s*:\s*"[^"]+?"\s*,\s*"parameters"\s*:\s*\{.*?\}\s*\}',
    re.DOTALL,
)

# Matches function-call-style text: memory_search(query="...") etc.
# `[^)]*` stops at the first closing paren, so arguments containing
# nested parentheses are trimmed only up to that point.
_FUNC_CALL_TEXT = re.compile(
    r'\b(?:memory_search|web_search|shell|python|read_file|write_file|list_files)'
    r'\s*\([^)]*\)',
)

# Matches chain-of-thought narration lines the model should keep internal.
# NOTE(review): applied per line (MULTILINE); any line starting with one of
# these prefixes is dropped whole, which can also hit legitimate answers
# that begin e.g. "Since 2009, ..." — accepted trade-off for small models.
_COT_PATTERNS = [
    re.compile(r"^(?:Since |Using |Let me |I'll use |I will use |Here's a possible ).*$", re.MULTILINE),
    re.compile(r"^(?:I found a relevant |This context suggests ).*$", re.MULTILINE),
]
|
||||
|
||||
|
||||
def _get_agent():
    """Return the module-level Timmy agent, creating it on first use.

    The agent is lazy-initialized exactly once and then reused for every
    request, so Agno's session storage sees a stable instance.

    Raises:
        Exception: re-raised from ``create_timmy()`` after logging, so the
            caller (HTTP route) can surface the failure.
    """
    global _agent
    if _agent is not None:
        return _agent

    from timmy.agent import create_timmy

    try:
        _agent = create_timmy()
        logger.info("Session: Timmy agent initialized (singleton)")
    except Exception as exc:
        logger.error("Session: Failed to create Timmy agent: %s", exc)
        raise
    return _agent
|
||||
|
||||
|
||||
def chat(message: str, session_id: Optional[str] = None) -> str:
    """Send a message to Timmy and get a response.

    Uses a persistent agent and session_id so Agno's SQLite history
    provides multi-turn conversation context.

    Args:
        message: The user's message.
        session_id: Optional session identifier (defaults to "dashboard").

    Returns:
        The agent's response text, sanitized of leaked tool calls and
        chain-of-thought narration. Always a ``str`` — a None/missing
        model response yields "".
    """
    sid = session_id or _DEFAULT_SESSION_ID
    agent = _get_agent()

    # Pre-processing: extract user facts (best-effort, never raises)
    _extract_facts(message)

    # Run with session_id so Agno retrieves history from SQLite
    run = agent.run(message, stream=False, session_id=sid)
    response_text = run.content if hasattr(run, "content") else str(run)

    # Bug fix: run.content can be None (e.g. a tool-only turn); the old
    # code then returned None despite the declared `-> str` return type,
    # because _clean_response passes falsy input through unchanged.
    if response_text is None:
        response_text = ""
    elif not isinstance(response_text, str):
        response_text = str(response_text)

    # Post-processing: clean up any leaked tool calls or chain-of-thought
    return _clean_response(response_text)
|
||||
|
||||
|
||||
def reset_session(session_id: Optional[str] = None) -> None:
    """Reset a session (clear conversation context).

    This clears the ConversationManager state. Agno's SQLite history
    is not cleared — that provides long-term continuity.

    Args:
        session_id: Session to reset; defaults to the dashboard session.
    """
    target = session_id or _DEFAULT_SESSION_ID
    try:
        from timmy.conversation import conversation_manager
    except Exception:
        # Graceful degradation — conversation manager unavailable.
        return
    try:
        conversation_manager.clear_context(target)
    except Exception:
        # Graceful degradation — a failed reset must never break chat.
        pass
|
||||
|
||||
|
||||
def _extract_facts(message: str) -> None:
    """Extract user facts from message and persist to memory system.

    Ported from TimmyWithMemory._extract_and_store_facts().
    Runs as a best-effort pre-processor — failures are logged, not raised.

    Args:
        message: The raw user message to scan for facts (currently: name).
    """
    try:
        from timmy.conversation import conversation_manager

        extracted = conversation_manager.extract_user_name(message)
        if not extracted:
            return
        # Persisting the fact is itself best-effort; a broken memory
        # backend must not block the chat turn.
        try:
            from timmy.memory_system import memory_system

            memory_system.update_user_fact("Name", extracted)
            logger.info("Session: Learned user name: %s", extracted)
        except Exception:
            pass
    except Exception as exc:
        logger.debug("Session: Fact extraction skipped: %s", exc)
|
||||
|
||||
|
||||
def _clean_response(text: str) -> str:
    """Remove hallucinated tool calls and chain-of-thought narration.

    Small models sometimes output raw JSON tool calls or narrate their
    internal reasoning instead of just answering. This strips those
    artifacts from the response.

    Args:
        text: Raw model output (falsy values are returned unchanged).

    Returns:
        The sanitized text with blank lines collapsed and edges trimmed.
    """
    if not text:
        return text

    # Strip JSON tool-call blocks first, then function-call-style fragments.
    cleaned = _FUNC_CALL_TEXT.sub("", _TOOL_CALL_JSON.sub("", text))

    # Drop whole lines of chain-of-thought narration.
    for cot_pattern in _COT_PATTERNS:
        cleaned = cot_pattern.sub("", cleaned)

    # Collapse the blank lines left behind by the removals.
    remaining = [line for line in cleaned.split("\n") if line.strip()]
    return "\n".join(remaining).strip()
|
||||
@@ -79,7 +79,9 @@ def test_create_timmy_embeds_system_prompt():
|
||||
|
||||
kwargs = MockAgent.call_args.kwargs
|
||||
# Prompt should contain base system prompt (may have memory context appended)
|
||||
assert kwargs["description"].startswith(TIMMY_SYSTEM_PROMPT[:100])
|
||||
# Default model (llama3.2) uses the lite prompt
|
||||
assert "Timmy" in kwargs["description"]
|
||||
assert "sovereign" in kwargs["description"]
|
||||
|
||||
|
||||
# ── Ollama host regression (container connectivity) ─────────────────────────
|
||||
@@ -194,3 +196,85 @@ def test_resolve_backend_auto_falls_back_on_non_apple():
|
||||
|
||||
from timmy.agent import _resolve_backend
|
||||
assert _resolve_backend(None) == "ollama"
|
||||
|
||||
|
||||
# ── _model_supports_tools ────────────────────────────────────────────────────
|
||||
|
||||
def test_model_supports_tools_llama32_returns_false():
    """llama3.2 (3B) is too small for reliable tool calling."""
    from timmy.agent import _model_supports_tools

    for model in ("llama3.2", "llama3.2:latest"):
        assert _model_supports_tools(model) is False


def test_model_supports_tools_llama31_returns_true():
    """llama3.1 (8B+) can handle tool calling."""
    from timmy.agent import _model_supports_tools

    for model in ("llama3.1", "llama3.3"):
        assert _model_supports_tools(model) is True


def test_model_supports_tools_other_small_models():
    """Other known small models should not get tools."""
    from timmy.agent import _model_supports_tools

    for model in ("phi-3", "tinyllama"):
        assert _model_supports_tools(model) is False


def test_model_supports_tools_unknown_model_gets_tools():
    """Unknown models default to tool-capable (optimistic)."""
    from timmy.agent import _model_supports_tools

    for model in ("mistral", "qwen2.5:72b"):
        assert _model_supports_tools(model) is True
|
||||
|
||||
|
||||
# ── Tool gating in create_timmy ──────────────────────────────────────────────
|
||||
|
||||
def test_create_timmy_no_tools_for_small_model():
    """llama3.2 should get no tools."""
    with patch("timmy.agent.Agent") as MockAgent, \
         patch("timmy.agent.Ollama"), \
         patch("timmy.agent.SqliteDb"):

        from timmy.agent import create_timmy
        create_timmy()

        # Default model is llama3.2 → tools should be None
        assert MockAgent.call_args.kwargs["tools"] is None


def test_create_timmy_includes_tools_for_large_model():
    """A tool-capable model (e.g. llama3.1) should attempt to include tools."""
    toolkit_stub = MagicMock()

    with patch("timmy.agent.Agent") as MockAgent, \
         patch("timmy.agent.Ollama"), \
         patch("timmy.agent.SqliteDb"), \
         patch("timmy.agent.create_full_toolkit", return_value=toolkit_stub), \
         patch("timmy.agent.settings") as settings_stub:

        settings_stub.ollama_model = "llama3.1"
        settings_stub.ollama_url = "http://localhost:11434"
        settings_stub.timmy_model_backend = "ollama"
        settings_stub.airllm_model_size = "70b"
        settings_stub.telemetry_enabled = False

        from timmy.agent import create_timmy
        create_timmy()

        assert MockAgent.call_args.kwargs["tools"] == [toolkit_stub]


def test_create_timmy_show_tool_calls_false():
    """show_tool_calls should always be False to prevent raw JSON in output."""
    with patch("timmy.agent.Agent") as MockAgent, \
         patch("timmy.agent.Ollama"), \
         patch("timmy.agent.SqliteDb"):

        from timmy.agent import create_timmy
        create_timmy()

        assert MockAgent.call_args.kwargs["show_tool_calls"] is False
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
|
||||
# ── Index ─────────────────────────────────────────────────────────────────────
|
||||
@@ -74,12 +74,7 @@ def test_agents_list_timmy_metadata(client):
|
||||
# ── Chat ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_chat_timmy_success(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_run = MagicMock()
|
||||
mock_run.content = "I am Timmy, operational and sovereign."
|
||||
mock_agent.run.return_value = mock_run
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="I am Timmy, operational and sovereign."):
|
||||
response = client.post("/agents/timmy/chat", data={"message": "status?"})
|
||||
|
||||
assert response.status_code == 200
|
||||
@@ -88,17 +83,14 @@ def test_chat_timmy_success(client):
|
||||
|
||||
|
||||
def test_chat_timmy_shows_user_message(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_agent.run.return_value = MagicMock(content="Acknowledged.")
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="Acknowledged."):
|
||||
response = client.post("/agents/timmy/chat", data={"message": "hello there"})
|
||||
|
||||
assert "hello there" in response.text
|
||||
|
||||
|
||||
def test_chat_timmy_ollama_offline(client):
|
||||
with patch("dashboard.routes.agents.create_timmy", side_effect=Exception("connection refused")):
|
||||
with patch("dashboard.routes.agents.timmy_chat", side_effect=Exception("connection refused")):
|
||||
response = client.post("/agents/timmy/chat", data={"message": "ping"})
|
||||
|
||||
assert response.status_code == 200
|
||||
@@ -120,10 +112,7 @@ def test_history_empty_shows_init_message(client):
|
||||
|
||||
|
||||
def test_history_records_user_and_agent_messages(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_agent.run.return_value = MagicMock(content="I am operational.")
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="I am operational."):
|
||||
client.post("/agents/timmy/chat", data={"message": "status check"})
|
||||
|
||||
response = client.get("/agents/timmy/history")
|
||||
@@ -132,7 +121,7 @@ def test_history_records_user_and_agent_messages(client):
|
||||
|
||||
|
||||
def test_history_records_error_when_offline(client):
|
||||
with patch("dashboard.routes.agents.create_timmy", side_effect=Exception("refused")):
|
||||
with patch("dashboard.routes.agents.timmy_chat", side_effect=Exception("refused")):
|
||||
client.post("/agents/timmy/chat", data={"message": "ping"})
|
||||
|
||||
response = client.get("/agents/timmy/history")
|
||||
@@ -141,10 +130,7 @@ def test_history_records_error_when_offline(client):
|
||||
|
||||
|
||||
def test_history_clear_resets_to_init_message(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_agent.run.return_value = MagicMock(content="Acknowledged.")
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="Acknowledged."):
|
||||
client.post("/agents/timmy/chat", data={"message": "hello"})
|
||||
|
||||
response = client.delete("/agents/timmy/history")
|
||||
@@ -153,10 +139,7 @@ def test_history_clear_resets_to_init_message(client):
|
||||
|
||||
|
||||
def test_history_empty_after_clear(client):
|
||||
mock_agent = MagicMock()
|
||||
mock_agent.run.return_value = MagicMock(content="OK.")
|
||||
|
||||
with patch("dashboard.routes.agents.create_timmy", return_value=mock_agent):
|
||||
with patch("dashboard.routes.agents.timmy_chat", return_value="OK."):
|
||||
client.post("/agents/timmy/chat", data={"message": "test"})
|
||||
|
||||
client.delete("/agents/timmy/history")
|
||||
|
||||
180
tests/test_session.py
Normal file
180
tests/test_session.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""Tests for timmy.session — persistent chat session with response sanitization."""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_session_singleton():
    """Reset the module-level singleton between tests.

    Ensures each test lazily re-creates (or re-mocks) the agent instead of
    reusing one cached by an earlier test; cleared again on teardown so no
    mock leaks into subsequent test modules.
    """
    import timmy.session as mod
    mod._agent = None
    yield
    mod._agent = None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# chat()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_chat_returns_string():
    """chat() should return a plain string response."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="Hello, sir.")

    with patch("timmy.session._get_agent", return_value=agent_stub):
        from timmy.session import chat
        reply = chat("Hi Timmy")

    assert isinstance(reply, str)
    assert "Hello, sir." in reply


def test_chat_passes_session_id():
    """chat() should pass the session_id to agent.run()."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="OK.")

    with patch("timmy.session._get_agent", return_value=agent_stub):
        from timmy.session import chat
        chat("test", session_id="my-session")

    assert agent_stub.run.call_args.kwargs["session_id"] == "my-session"


def test_chat_uses_default_session_id():
    """chat() should use 'dashboard' as the default session_id."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="OK.")

    with patch("timmy.session._get_agent", return_value=agent_stub):
        from timmy.session import chat
        chat("test")

    assert agent_stub.run.call_args.kwargs["session_id"] == "dashboard"
|
||||
|
||||
|
||||
def test_chat_singleton_agent_reused():
    """Calling chat() multiple times should reuse the same agent instance."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="OK.")

    with patch("timmy.agent.create_timmy", return_value=agent_stub) as factory:
        from timmy.session import chat
        chat("first message")
        chat("second message")

        # Factory called only once (singleton)
        factory.assert_called_once()


def test_chat_extracts_user_name():
    """chat() should extract user name from message and persist to memory."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="Nice to meet you!")
    memory_stub = MagicMock()

    with patch("timmy.session._get_agent", return_value=agent_stub), \
         patch("timmy.memory_system.memory_system", memory_stub):
        from timmy.session import chat
        chat("my name is Alex")

    memory_stub.update_user_fact.assert_called_once_with("Name", "Alex")


def test_chat_graceful_degradation_on_memory_failure():
    """chat() should still work if the conversation manager raises."""
    agent_stub = MagicMock()
    agent_stub.run.return_value = MagicMock(content="I'm operational.")

    with patch("timmy.session._get_agent", return_value=agent_stub), \
         patch("timmy.conversation.conversation_manager") as cm_stub:
        cm_stub.extract_user_name.side_effect = Exception("memory broken")

        from timmy.session import chat
        reply = chat("test message")

    assert "operational" in reply
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _clean_response()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_clean_response_strips_json_tool_calls():
    """JSON tool call blocks should be removed from response text."""
    from timmy.session import _clean_response

    raw = 'Here is the answer. {"name": "python", "parameters": {"code": "0.15 * 3847.23", "variable_to_return": "result"}} The result is 577.'
    sanitized = _clean_response(raw)

    assert '{"name"' not in sanitized
    assert '"parameters"' not in sanitized
    assert "The result is 577." in sanitized


def test_clean_response_strips_function_calls():
    """Function-call-style text should be removed."""
    from timmy.session import _clean_response

    raw = 'I will search for that. memory_search(query="recall number") Found nothing.'
    sanitized = _clean_response(raw)

    assert "memory_search(" not in sanitized
    assert "Found nothing." in sanitized


def test_clean_response_strips_chain_of_thought():
    """Chain-of-thought narration lines should be removed."""
    from timmy.session import _clean_response

    raw = """Since there's no direct answer in my vault or hot memory, I'll use memory_search.
Using memory_search(query="what is special"), I found a context.
Here's a possible response:
77 is special because it's a prime number."""
    sanitized = _clean_response(raw)

    assert "Since there's no" not in sanitized
    assert "Here's a possible" not in sanitized
    assert "77 is special" in sanitized


def test_clean_response_preserves_normal_text():
    """Normal text without tool artifacts should pass through unchanged."""
    from timmy.session import _clean_response

    plain = "The number 77 is the sum of the first seven primes: 2+3+5+7+11+13+17."
    assert _clean_response(plain) == plain


def test_clean_response_handles_empty_string():
    """Empty string should be returned as-is."""
    from timmy.session import _clean_response

    assert _clean_response("") == ""


def test_clean_response_handles_none():
    """None should be returned as-is."""
    from timmy.session import _clean_response

    assert _clean_response(None) is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# reset_session()
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_reset_session_clears_context():
    """reset_session() should clear the conversation context."""
    with patch("timmy.conversation.conversation_manager") as cm_stub:
        from timmy.session import reset_session
        reset_session("test-session")

        cm_stub.clear_context.assert_called_once_with("test-session")
|
||||
Reference in New Issue
Block a user