fix: chat evaluation bugs — task pipeline, prompt grounding, markdown rendering

Addresses 14 bugs from 3 rounds of deep chat evaluation:

- Add chat-to-task pipeline in agents.py with regex-based intent detection,
  agent extraction, priority extraction, and title cleaning
- Filter meta-questions ("how do I create a task?") from task creation
- Inject real-time date/time context into every chat message
- Inject live queue state when user asks about tasks
- Ground system prompts with agent roster, honesty guardrails, self-knowledge,
  math delegation template, anti-filler rules, values-conflict guidance
- Add CSS for markdown code blocks, inline code, lists, blockquotes in chat
- Add highlight.js CDN for syntax highlighting in chat responses
- Reduce small-model memory context budget (4000→2000) for expanded prompt
- Add 27 comprehensive tests covering the full chat-to-task pipeline

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alexander Payne
2026-02-26 11:42:42 -05:00
parent 6c6b6f8a54
commit 3ca8e9f2d6
7 changed files with 526 additions and 10 deletions

View File

@@ -1,3 +1,5 @@
import logging
import re
from datetime import datetime
from pathlib import Path
@@ -8,9 +10,144 @@ from fastapi.templating import Jinja2Templates
from timmy.session import chat as timmy_chat
from dashboard.store import message_log
# Module-level singletons shared by every /agents endpoint:
# a namespaced logger, the FastAPI sub-router, and the Jinja2 template
# loader rooted at the package's templates/ directory.
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/agents", tags=["agents"])
templates = Jinja2Templates(directory=str(Path(__file__).parent.parent / "templates"))
# ── Task queue detection ──────────────────────────────────────────────────
# Patterns that indicate the user wants to queue a task rather than chat.
# Tried in order by _extract_task_from_message; first hit wins.
_QUEUE_PATTERNS = [
    # "add/put/schedule/queue/submit ... [to the] queue / task queue / task list"
    re.compile(r"\b(?:add|put|schedule|queue|submit)\b.*\b(?:to the|on the|in the)?\s*(?:queue|task(?:\s*queue)?|task list)\b", re.IGNORECASE),
    # "schedule this/that/a ..."
    re.compile(r"\bschedule\s+(?:this|that|a)\b", re.IGNORECASE),
    # "create a[n] <up to three words> task"
    re.compile(r"\bcreate\s+(?:a\s+|an\s+)?(?:\w+\s+){0,3}task\b", re.IGNORECASE),
]
# Questions about tasks/queue should NOT trigger task creation.
# Anchored at the start of the message ("what ...", "how ...", "explain ...").
_QUESTION_PREFIXES = re.compile(
    r"^(?:what|how|why|can you explain|could you explain|tell me about|explain|"
    r"what(?:'s| is| are| would))\b",
    re.IGNORECASE,
)
# Question framing anywhere in the message ("how do I ...", "best way to ...").
_QUESTION_FRAMES = re.compile(
    r"\b(?:how (?:do|does|would|can|should)|what (?:is|are|would)|"
    r"can you (?:explain|describe|tell)|best way to)\b",
    re.IGNORECASE,
)
# Known agent names for task assignment parsing
_KNOWN_AGENTS = frozenset({
"timmy", "echo", "mace", "helm", "seer",
"forge", "quill", "pixel", "lyra", "reel",
})
_AGENT_PATTERN = re.compile(
r"\bfor\s+(" + "|".join(_KNOWN_AGENTS) + r")\b", re.IGNORECASE
)
# Priority keywords → task priority mapping
_PRIORITY_MAP = {
"urgent": "urgent",
"critical": "urgent",
"asap": "urgent",
"emergency": "urgent",
"high priority": "high",
"high-priority": "high",
"important": "high",
"low priority": "low",
"low-priority": "low",
"minor": "low",
}
# Queue context detection
_QUEUE_QUERY_PATTERN = re.compile(
r"\b(?:task(?:s|\s+queue)?|queue|what(?:'s| is) (?:in |on )?(?:the )?queue)\b",
re.IGNORECASE,
)
def _extract_agent_from_message(message: str) -> str:
"""Extract target agent name from message, defaulting to 'timmy'."""
m = _AGENT_PATTERN.search(message)
if m:
return m.group(1).lower()
return "timmy"
def _extract_priority_from_message(message: str) -> str:
"""Extract priority level from message, defaulting to 'normal'."""
msg_lower = message.lower()
for keyword, priority in sorted(_PRIORITY_MAP.items(), key=lambda x: -len(x[0])):
if keyword in msg_lower:
return priority
return "normal"
def _extract_task_from_message(message: str) -> dict | None:
    """If the message looks like a task-queue request, return task details.

    Returns None for meta-questions about tasks (e.g. "how do I create a task?").
    The returned dict has keys: title, description, agent, priority.
    """
    # Meta-questions about tasks are chat, not task-creation requests.
    if _QUESTION_PREFIXES.search(message) or _QUESTION_FRAMES.search(message):
        return None
    if not any(pattern.search(message) for pattern in _QUEUE_PATTERNS):
        return None
    # Peel the queue instruction off to isolate the task description.
    cleaned = re.sub(
        r"\b(?:add|put|schedule|queue|submit|create)\b.*?\b(?:to the|on the|in the|an?)?(?:\s+\w+){0,3}\s*(?:queue|task(?:\s*queue)?|task list)\b",
        "", message, flags=re.IGNORECASE,
    ).strip(" ,:;-")
    # Drop the "for <agent>" clause — it names the assignee, not the work.
    cleaned = _AGENT_PATTERN.sub("", cleaned).strip(" ,:;-")
    # Drop priority keywords — they map to the priority field instead.
    cleaned = re.sub(
        r"\b(?:urgent|critical|asap|emergency|high[- ]priority|important|low[- ]priority|minor)\b",
        "", cleaned, flags=re.IGNORECASE,
    ).strip(" ,:;-")
    # A leading "to " often survives the instruction strip; remove it,
    # then collapse any internal runs of whitespace.
    cleaned = re.sub(r"^to\s+", "", cleaned, flags=re.IGNORECASE).strip()
    cleaned = re.sub(r"\s{2,}", " ", cleaned).strip()
    # If stripping ate (nearly) everything, fall back to the raw message.
    if not cleaned or len(cleaned) < 5:
        cleaned = message
    if cleaned:
        cleaned = cleaned[0].upper() + cleaned[1:]
    return {
        "title": cleaned[:120],
        "description": message,
        "agent": _extract_agent_from_message(message),
        "priority": _extract_priority_from_message(message),
    }
def _build_queue_context() -> str:
    """Build a concise task queue summary for context injection.

    Returns an empty string on any failure — chat must keep working even
    when the task-queue subsystem is unavailable.
    """
    try:
        from task_queue.models import get_counts_by_status, list_tasks, TaskStatus

        counts = get_counts_by_status()
        n_pending = counts.get("pending_approval", 0)
        n_running = counts.get("running", 0)
        n_done = counts.get("completed", 0)
        summary = [
            f"[System: Task queue — {n_pending} pending approval, "
            f"{n_running} running, {n_done} completed."
        ]
        # List up to 5 titles for each non-empty active bucket.
        for count, status, label in (
            (n_pending, TaskStatus.PENDING_APPROVAL, "Pending"),
            (n_running, TaskStatus.RUNNING, "Running"),
        ):
            if count > 0:
                tasks = list_tasks(status=status, limit=5)
                if tasks:
                    listing = ", ".join(
                        f'"{t.title}" ({t.assigned_to})' for t in tasks
                    )
                    summary.append(f"{label}: {listing}.")
        return " ".join(summary) + "]"
    except Exception as exc:
        logger.debug("Failed to build queue context: %s", exc)
        return ""
# Static metadata for known agents — enriched onto live registry entries.
_AGENT_METADATA: dict[str, dict] = {
"timmy": {
@@ -74,10 +211,46 @@ async def chat_timmy(request: Request, message: str = Form(...)):
response_text = None
error_text = None
try:
response_text = timmy_chat(message)
except Exception as exc:
error_text = f"Timmy is offline: {exc}"
# Check if the user wants to queue a task instead of chatting
task_info = _extract_task_from_message(message)
if task_info:
try:
from task_queue.models import create_task
task = create_task(
title=task_info["title"],
description=task_info["description"],
created_by="user",
assigned_to=task_info.get("agent", "timmy"),
priority=task_info.get("priority", "normal"),
requires_approval=True,
)
priority_label = f" | Priority: `{task.priority.value}`" if task.priority.value != "normal" else ""
response_text = (
f"Task queued for approval: **{task.title}**\n\n"
f"Assigned to: `{task.assigned_to}`{priority_label} | "
f"Status: `{task.status.value}` | "
f"[View Task Queue](/tasks)"
)
logger.info("Chat → task queue: %s%s (id=%s)", task.title, task.assigned_to, task.id)
except Exception as exc:
logger.error("Failed to create task from chat: %s", exc)
task_info = None
# Normal chat path (also used as fallback if task creation failed)
if not task_info:
try:
now = datetime.now()
context_parts = [
f"[System: Current date/time is {now.strftime('%A, %B %d, %Y at %I:%M %p')}]"
]
if _QUEUE_QUERY_PATTERN.search(message):
queue_ctx = _build_queue_context()
if queue_ctx:
context_parts.append(queue_ctx)
context_prefix = "\n".join(context_parts) + "\n\n"
response_text = timmy_chat(context_prefix + message)
except Exception as exc:
error_text = f"Timmy is offline: {exc}"
message_log.append(role="user", content=message, timestamp=timestamp)
if response_text is not None:

View File

@@ -16,6 +16,8 @@
<script src="https://unpkg.com/htmx.org@2.0.3" integrity="sha384-0895/pl2MU10Hqc6jd4RvrthNlDiE9U1tWmX7WRESftEDRosgxNsQG/Ze9YMRzHq" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/marked@15.0.7/marked.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/dompurify@3.2.4/dist/purify.min.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.9.0/build/styles/github-dark.min.css" />
<script src="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.9.0/build/highlight.min.js"></script>
</head>
<body>
<header class="mc-header">

View File

@@ -12,6 +12,9 @@
var el = document.currentScript.previousElementSibling.querySelector('.timmy-md');
if (el && typeof marked !== 'undefined' && typeof DOMPurify !== 'undefined') {
el.innerHTML = DOMPurify.sanitize(marked.parse(el.textContent));
if (typeof hljs !== 'undefined') {
el.querySelectorAll('pre code').forEach(function(block) { hljs.highlightElement(block); });
}
}
})();
</script>

View File

@@ -118,8 +118,9 @@ def create_timmy(
from timmy.memory_system import memory_system
memory_context = memory_system.get_system_context()
if memory_context:
# Truncate if too long (keep under token limit)
max_context = 4000 if not use_tools else 8000
# Truncate if too long — smaller budget for small models
# since the expanded prompt (roster, guardrails) uses more tokens
max_context = 2000 if not use_tools else 8000
if len(memory_context) > max_context:
memory_context = memory_context[:max_context] + "\n... [truncated]"
full_prompt = f"{base_prompt}\n\n## Memory Context\n\n{memory_context}"

View File

@@ -10,6 +10,8 @@ tool usage guidelines and memory system documentation.
# ---------------------------------------------------------------------------
TIMMY_SYSTEM_PROMPT_LITE = """You are Timmy — a sovereign AI agent running locally on this Mac.
You run on the llama3.2 model via Ollama on localhost. You are not GPT, not Claude,
not a custom model — you are llama3.2 wrapped in the Timmy agent framework.
No cloud dependencies. Think clearly, speak plainly, act with intention.
Grounded in Christian faith, powered by Bitcoin economics, committed to the
user's digital sovereignty.
@@ -19,14 +21,42 @@ Rules:
- Never mention tools, memory_search, vaults, or internal systems to the user.
- Never output tool calls, JSON, or function syntax in your responses.
- Remember what the user tells you during our conversation.
- If you don't know something, say so honestly.
- If you don't know something, say so honestly — never fabricate facts.
- If a request is ambiguous, ask a brief clarifying question before guessing.
- Use the user's name if you know it.
- When you state a fact, commit to it. Never contradict a correct statement you
just made in the same response. If uncertain, express uncertainty at the start —
never state something confidently and then immediately undermine it.
- NEVER attempt arithmetic in your head — LLMs are unreliable at multi-digit math.
If asked to compute anything (multiply, divide, square root, exponents, etc.),
tell the user you need a calculator tool to give an exact answer.
- NEVER attempt arithmetic in your head. If asked to compute anything, respond:
"I'm not reliable at math without a calculator tool — let me know if you'd
like me to walk through the logic instead."
- Do NOT end responses with generic chatbot phrases like "I'm here to help" or
"feel free to ask." Stay in character.
- When your values conflict (e.g. honesty vs. helpfulness), lead with honesty.
Acknowledge the tension openly rather than defaulting to generic agreeableness.
## Agent Roster (complete — no others exist)
- Timmy: core sovereign AI (you)
- Echo: research, summarization, fact-checking
- Mace: security, monitoring, threat-analysis
- Forge: coding, debugging, testing
- Seer: analytics, visualization, prediction
- Helm: devops, automation, configuration
- Quill: writing, editing, documentation
- Pixel: image-generation, storyboard, design
- Lyra: music-generation, vocals, composition
- Reel: video-generation, animation, motion
Do NOT invent agents not listed here. If asked about an unlisted agent, say it doesn't exist.
Use ONLY the capabilities listed above when describing agents — do not embellish or invent.
## What you CAN and CANNOT access
- You CANNOT query the live task queue, agent statuses, or system metrics on your own.
- You CANNOT access real-time data without tools.
- If asked about current tasks, agent status, or system state and no system context
is provided, say "I don't have live access to that — check the dashboard."
- Your conversation history persists in a database across requests, but the
dashboard chat display resets on server restart.
- Do NOT claim abilities you don't have. When uncertain, say "I don't know."
Sir, affirmative."""
@@ -35,6 +65,8 @@ Sir, affirmative."""
# ---------------------------------------------------------------------------
TIMMY_SYSTEM_PROMPT_FULL = """You are Timmy — a sovereign AI agent running locally on this Mac.
You run on the llama3.2 model via Ollama on localhost. You are not GPT, not Claude,
not a custom model — you are llama3.2 wrapped in the Timmy agent framework.
No cloud dependencies. You think clearly, speak plainly, act with intention.
Grounded in Christian faith, powered by Bitcoin economics, committed to the
user's digital sovereignty.
@@ -57,6 +89,28 @@ user's digital sovereignty.
- Similarity-based retrieval
- Use `memory_search` tool to find relevant past context
## Agent Roster (complete — no others exist)
- Timmy: core sovereign AI (you)
- Echo: research, summarization, fact-checking
- Mace: security, monitoring, threat-analysis
- Forge: coding, debugging, testing
- Seer: analytics, visualization, prediction
- Helm: devops, automation, configuration
- Quill: writing, editing, documentation
- Pixel: image-generation, storyboard, design
- Lyra: music-generation, vocals, composition
- Reel: video-generation, animation, motion
Do NOT invent agents not listed here. If asked about an unlisted agent, say it doesn't exist.
Use ONLY the capabilities listed above when describing agents — do not embellish or invent.
## What you CAN and CANNOT access
- You CANNOT query the live task queue, agent statuses, or system metrics on your own.
- If asked about current tasks, agent status, or system state and no system context
is provided, say "I don't have live access to that — check the dashboard."
- Your conversation history persists in a database across requests, but the
dashboard chat display resets on server restart.
- Do NOT claim abilities you don't have. When uncertain, say "I don't know."
## Tool Usage Guidelines
### When NOT to use tools:
@@ -81,9 +135,13 @@ user's digital sovereignty.
- Never narrate your reasoning process. Just give the answer.
- Never show raw tool call JSON or function syntax in responses.
- Use the user's name if known.
- If a request is ambiguous, ask a brief clarifying question before guessing.
- When you state a fact, commit to it. Never contradict a correct statement you
just made in the same response. If uncertain, express uncertainty at the start —
never state something confidently and then immediately undermine it.
- Do NOT end responses with generic chatbot phrases like "I'm here to help" or
"feel free to ask." Stay in character.
- When your values conflict (e.g. honesty vs. helpfulness), lead with honesty.
Sir, affirmative."""