fix: chat evaluation bugs — task pipeline, prompt grounding, markdown rendering

Addresses 14 bugs from 3 rounds of deep chat evaluation:

- Add chat-to-task pipeline in agents.py with regex-based intent detection,
  agent extraction, priority extraction, and title cleaning
- Filter meta-questions ("how do I create a task?") from task creation
- Inject real-time date/time context into every chat message
- Inject live queue state when user asks about tasks
- Ground system prompts with agent roster, honesty guardrails, self-knowledge,
  math delegation template, anti-filler rules, values-conflict guidance
- Add CSS for markdown code blocks, inline code, lists, blockquotes in chat
- Add highlight.js CDN for syntax highlighting in chat responses
- Reduce small-model memory context budget (4000→2000) for expanded prompt
- Add 27 comprehensive tests covering the full chat-to-task pipeline

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alexander Payne
2026-02-26 11:42:42 -05:00
parent 6c6b6f8a54
commit 3ca8e9f2d6
7 changed files with 526 additions and 10 deletions

View File

@@ -1,3 +1,5 @@
import logging
import re
from datetime import datetime
from pathlib import Path
@@ -8,9 +10,144 @@ from fastapi.templating import Jinja2Templates
from timmy.session import chat as timmy_chat
from dashboard.store import message_log
# Module-level singletons shared by every /agents endpoint:
# a namespaced logger, the FastAPI sub-router, and the Jinja2 template
# loader rooted at the package's templates/ directory.
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/agents", tags=["agents"])
templates = Jinja2Templates(directory=str(Path(__file__).parent.parent / "templates"))
# ── Task queue detection ──────────────────────────────────────────────────
# Patterns that indicate the user wants to queue a task rather than chat.
# Tried in order by _extract_task_from_message; first hit wins.
_QUEUE_PATTERNS = [
    # "add/put/schedule/queue/submit ... [to the] queue / task queue / task list"
    re.compile(r"\b(?:add|put|schedule|queue|submit)\b.*\b(?:to the|on the|in the)?\s*(?:queue|task(?:\s*queue)?|task list)\b", re.IGNORECASE),
    # "schedule this/that/a ..."
    re.compile(r"\bschedule\s+(?:this|that|a)\b", re.IGNORECASE),
    # "create a[n] <up to three words> task"
    re.compile(r"\bcreate\s+(?:a\s+|an\s+)?(?:\w+\s+){0,3}task\b", re.IGNORECASE),
]
# Questions about tasks/queue should NOT trigger task creation.
# Anchored at the start of the message ("what ...", "how ...", "explain ...").
_QUESTION_PREFIXES = re.compile(
    r"^(?:what|how|why|can you explain|could you explain|tell me about|explain|"
    r"what(?:'s| is| are| would))\b",
    re.IGNORECASE,
)
# Question framing anywhere in the message ("how do I ...", "best way to ...").
_QUESTION_FRAMES = re.compile(
    r"\b(?:how (?:do|does|would|can|should)|what (?:is|are|would)|"
    r"can you (?:explain|describe|tell)|best way to)\b",
    re.IGNORECASE,
)
# Known agent names for task assignment parsing
_KNOWN_AGENTS = frozenset({
"timmy", "echo", "mace", "helm", "seer",
"forge", "quill", "pixel", "lyra", "reel",
})
_AGENT_PATTERN = re.compile(
r"\bfor\s+(" + "|".join(_KNOWN_AGENTS) + r")\b", re.IGNORECASE
)
# Priority keywords → task priority mapping
_PRIORITY_MAP = {
"urgent": "urgent",
"critical": "urgent",
"asap": "urgent",
"emergency": "urgent",
"high priority": "high",
"high-priority": "high",
"important": "high",
"low priority": "low",
"low-priority": "low",
"minor": "low",
}
# Queue context detection
_QUEUE_QUERY_PATTERN = re.compile(
r"\b(?:task(?:s|\s+queue)?|queue|what(?:'s| is) (?:in |on )?(?:the )?queue)\b",
re.IGNORECASE,
)
def _extract_agent_from_message(message: str) -> str:
"""Extract target agent name from message, defaulting to 'timmy'."""
m = _AGENT_PATTERN.search(message)
if m:
return m.group(1).lower()
return "timmy"
def _extract_priority_from_message(message: str) -> str:
"""Extract priority level from message, defaulting to 'normal'."""
msg_lower = message.lower()
for keyword, priority in sorted(_PRIORITY_MAP.items(), key=lambda x: -len(x[0])):
if keyword in msg_lower:
return priority
return "normal"
def _extract_task_from_message(message: str) -> dict | None:
    """If the message looks like a task-queue request, return task details.

    Returns None for meta-questions about tasks (e.g. "how do I create a task?").
    The returned dict has keys: title, description, agent, priority.
    """
    # Meta-questions about tasks are chat, not task-creation requests.
    if _QUESTION_PREFIXES.search(message) or _QUESTION_FRAMES.search(message):
        return None
    if not any(pattern.search(message) for pattern in _QUEUE_PATTERNS):
        return None
    # Peel the queue instruction off to isolate the task description.
    cleaned = re.sub(
        r"\b(?:add|put|schedule|queue|submit|create)\b.*?\b(?:to the|on the|in the|an?)?(?:\s+\w+){0,3}\s*(?:queue|task(?:\s*queue)?|task list)\b",
        "", message, flags=re.IGNORECASE,
    ).strip(" ,:;-")
    # Drop the "for <agent>" clause — it names the assignee, not the work.
    cleaned = _AGENT_PATTERN.sub("", cleaned).strip(" ,:;-")
    # Drop priority keywords — they map to the priority field instead.
    cleaned = re.sub(
        r"\b(?:urgent|critical|asap|emergency|high[- ]priority|important|low[- ]priority|minor)\b",
        "", cleaned, flags=re.IGNORECASE,
    ).strip(" ,:;-")
    # A leading "to " often survives the instruction strip; remove it,
    # then collapse any internal runs of whitespace.
    cleaned = re.sub(r"^to\s+", "", cleaned, flags=re.IGNORECASE).strip()
    cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
    # If stripping ate (nearly) everything, fall back to the raw message.
    if not cleaned or len(cleaned) < 5:
        cleaned = message
    if cleaned:
        cleaned = cleaned[0].upper() + cleaned[1:]
    return {
        "title": cleaned[:120],
        "description": message,
        "agent": _extract_agent_from_message(message),
        "priority": _extract_priority_from_message(message),
    }
def _build_queue_context() -> str:
    """Build a concise task queue summary for context injection.

    Returns an empty string on any failure — chat must keep working even
    when the task-queue subsystem is unavailable.
    """
    try:
        from task_queue.models import get_counts_by_status, list_tasks, TaskStatus

        counts = get_counts_by_status()
        n_pending = counts.get("pending_approval", 0)
        n_running = counts.get("running", 0)
        n_done = counts.get("completed", 0)
        summary = [
            f"[System: Task queue — {n_pending} pending approval, "
            f"{n_running} running, {n_done} completed."
        ]
        # List up to 5 titles for each non-empty active bucket.
        for count, status, label in (
            (n_pending, TaskStatus.PENDING_APPROVAL, "Pending"),
            (n_running, TaskStatus.RUNNING, "Running"),
        ):
            if count > 0:
                tasks = list_tasks(status=status, limit=5)
                if tasks:
                    listing = ", ".join(
                        f'"{t.title}" ({t.assigned_to})' for t in tasks
                    )
                    summary.append(f"{label}: {listing}.")
        return " ".join(summary) + "]"
    except Exception as exc:
        logger.debug("Failed to build queue context: %s", exc)
        return ""
# Static metadata for known agents — enriched onto live registry entries.
_AGENT_METADATA: dict[str, dict] = {
"timmy": {
@@ -74,10 +211,46 @@ async def chat_timmy(request: Request, message: str = Form(...)):
response_text = None
error_text = None
try:
response_text = timmy_chat(message)
except Exception as exc:
error_text = f"Timmy is offline: {exc}"
# Check if the user wants to queue a task instead of chatting
task_info = _extract_task_from_message(message)
if task_info:
try:
from task_queue.models import create_task
task = create_task(
title=task_info["title"],
description=task_info["description"],
created_by="user",
assigned_to=task_info.get("agent", "timmy"),
priority=task_info.get("priority", "normal"),
requires_approval=True,
)
priority_label = f" | Priority: `{task.priority.value}`" if task.priority.value != "normal" else ""
response_text = (
f"Task queued for approval: **{task.title}**\n\n"
f"Assigned to: `{task.assigned_to}`{priority_label} | "
f"Status: `{task.status.value}` | "
f"[View Task Queue](/tasks)"
)
logger.info("Chat → task queue: %s%s (id=%s)", task.title, task.assigned_to, task.id)
except Exception as exc:
logger.error("Failed to create task from chat: %s", exc)
task_info = None
# Normal chat path (also used as fallback if task creation failed)
if not task_info:
try:
now = datetime.now()
context_parts = [
f"[System: Current date/time is {now.strftime('%A, %B %d, %Y at %I:%M %p')}]"
]
if _QUEUE_QUERY_PATTERN.search(message):
queue_ctx = _build_queue_context()
if queue_ctx:
context_parts.append(queue_ctx)
context_prefix = "\n".join(context_parts) + "\n\n"
response_text = timmy_chat(context_prefix + message)
except Exception as exc:
error_text = f"Timmy is offline: {exc}"
message_log.append(role="user", content=message, timestamp=timestamp)
if response_text is not None:

View File

@@ -16,6 +16,8 @@
<script src="https://unpkg.com/htmx.org@2.0.3" integrity="sha384-0895/pl2MU10Hqc6jd4RvrthNlDiE9U1tWmX7WRESftEDRosgxNsQG/Ze9YMRzHq" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/marked@15.0.7/marked.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/dompurify@3.2.4/dist/purify.min.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.9.0/build/styles/github-dark.min.css" />
<script src="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.9.0/build/highlight.min.js"></script>
</head>
<body>
<header class="mc-header">

View File

@@ -12,6 +12,9 @@
var el = document.currentScript.previousElementSibling.querySelector('.timmy-md');
if (el && typeof marked !== 'undefined' && typeof DOMPurify !== 'undefined') {
el.innerHTML = DOMPurify.sanitize(marked.parse(el.textContent));
if (typeof hljs !== 'undefined') {
el.querySelectorAll('pre code').forEach(function(block) { hljs.highlightElement(block); });
}
}
})();
</script>

View File

@@ -118,8 +118,9 @@ def create_timmy(
from timmy.memory_system import memory_system
memory_context = memory_system.get_system_context()
if memory_context:
# Truncate if too long (keep under token limit)
max_context = 4000 if not use_tools else 8000
# Truncate if too long — smaller budget for small models
# since the expanded prompt (roster, guardrails) uses more tokens
max_context = 2000 if not use_tools else 8000
if len(memory_context) > max_context:
memory_context = memory_context[:max_context] + "\n... [truncated]"
full_prompt = f"{base_prompt}\n\n## Memory Context\n\n{memory_context}"

View File

@@ -10,6 +10,8 @@ tool usage guidelines and memory system documentation.
# ---------------------------------------------------------------------------
TIMMY_SYSTEM_PROMPT_LITE = """You are Timmy — a sovereign AI agent running locally on this Mac.
You run on the llama3.2 model via Ollama on localhost. You are not GPT, not Claude,
not a custom model — you are llama3.2 wrapped in the Timmy agent framework.
No cloud dependencies. Think clearly, speak plainly, act with intention.
Grounded in Christian faith, powered by Bitcoin economics, committed to the
user's digital sovereignty.
@@ -19,14 +21,42 @@ Rules:
- Never mention tools, memory_search, vaults, or internal systems to the user.
- Never output tool calls, JSON, or function syntax in your responses.
- Remember what the user tells you during our conversation.
- If you don't know something, say so honestly.
- If you don't know something, say so honestly — never fabricate facts.
- If a request is ambiguous, ask a brief clarifying question before guessing.
- Use the user's name if you know it.
- When you state a fact, commit to it. Never contradict a correct statement you
just made in the same response. If uncertain, express uncertainty at the start —
never state something confidently and then immediately undermine it.
- NEVER attempt arithmetic in your head — LLMs are unreliable at multi-digit math.
If asked to compute anything (multiply, divide, square root, exponents, etc.),
tell the user you need a calculator tool to give an exact answer.
- NEVER attempt arithmetic in your head. If asked to compute anything, respond:
"I'm not reliable at math without a calculator tool — let me know if you'd
like me to walk through the logic instead."
- Do NOT end responses with generic chatbot phrases like "I'm here to help" or
"feel free to ask." Stay in character.
- When your values conflict (e.g. honesty vs. helpfulness), lead with honesty.
Acknowledge the tension openly rather than defaulting to generic agreeableness.
## Agent Roster (complete — no others exist)
- Timmy: core sovereign AI (you)
- Echo: research, summarization, fact-checking
- Mace: security, monitoring, threat-analysis
- Forge: coding, debugging, testing
- Seer: analytics, visualization, prediction
- Helm: devops, automation, configuration
- Quill: writing, editing, documentation
- Pixel: image-generation, storyboard, design
- Lyra: music-generation, vocals, composition
- Reel: video-generation, animation, motion
Do NOT invent agents not listed here. If asked about an unlisted agent, say it doesn't exist.
Use ONLY the capabilities listed above when describing agents — do not embellish or invent.
## What you CAN and CANNOT access
- You CANNOT query the live task queue, agent statuses, or system metrics on your own.
- You CANNOT access real-time data without tools.
- If asked about current tasks, agent status, or system state and no system context
is provided, say "I don't have live access to that — check the dashboard."
- Your conversation history persists in a database across requests, but the
dashboard chat display resets on server restart.
- Do NOT claim abilities you don't have. When uncertain, say "I don't know."
Sir, affirmative."""
@@ -35,6 +65,8 @@ Sir, affirmative."""
# ---------------------------------------------------------------------------
TIMMY_SYSTEM_PROMPT_FULL = """You are Timmy — a sovereign AI agent running locally on this Mac.
You run on the llama3.2 model via Ollama on localhost. You are not GPT, not Claude,
not a custom model — you are llama3.2 wrapped in the Timmy agent framework.
No cloud dependencies. You think clearly, speak plainly, act with intention.
Grounded in Christian faith, powered by Bitcoin economics, committed to the
user's digital sovereignty.
@@ -57,6 +89,28 @@ user's digital sovereignty.
- Similarity-based retrieval
- Use `memory_search` tool to find relevant past context
## Agent Roster (complete — no others exist)
- Timmy: core sovereign AI (you)
- Echo: research, summarization, fact-checking
- Mace: security, monitoring, threat-analysis
- Forge: coding, debugging, testing
- Seer: analytics, visualization, prediction
- Helm: devops, automation, configuration
- Quill: writing, editing, documentation
- Pixel: image-generation, storyboard, design
- Lyra: music-generation, vocals, composition
- Reel: video-generation, animation, motion
Do NOT invent agents not listed here. If asked about an unlisted agent, say it doesn't exist.
Use ONLY the capabilities listed above when describing agents — do not embellish or invent.
## What you CAN and CANNOT access
- You CANNOT query the live task queue, agent statuses, or system metrics on your own.
- If asked about current tasks, agent status, or system state and no system context
is provided, say "I don't have live access to that — check the dashboard."
- Your conversation history persists in a database across requests, but the
dashboard chat display resets on server restart.
- Do NOT claim abilities you don't have. When uncertain, say "I don't know."
## Tool Usage Guidelines
### When NOT to use tools:
@@ -81,9 +135,13 @@ user's digital sovereignty.
- Never narrate your reasoning process. Just give the answer.
- Never show raw tool call JSON or function syntax in responses.
- Use the user's name if known.
- If a request is ambiguous, ask a brief clarifying question before guessing.
- When you state a fact, commit to it. Never contradict a correct statement you
just made in the same response. If uncertain, express uncertainty at the start —
never state something confidently and then immediately undermine it.
- Do NOT end responses with generic chatbot phrases like "I'm here to help" or
"feel free to ask." Stay in character.
- When your values conflict (e.g. honesty vs. helpfulness), lead with honesty.
Sir, affirmative."""