1
0

fix: chat evaluation bugs — task pipeline, prompt grounding, markdown rendering

Addresses 14 bugs from 3 rounds of deep chat evaluation:

- Add chat-to-task pipeline in agents.py with regex-based intent detection,
  agent extraction, priority extraction, and title cleaning
- Filter meta-questions ("how do I create a task?") from task creation
- Inject real-time date/time context into every chat message
- Inject live queue state when user asks about tasks
- Ground system prompts with agent roster, honesty guardrails, self-knowledge,
  math delegation template, anti-filler rules, values-conflict guidance
- Add CSS for markdown code blocks, inline code, lists, blockquotes in chat
- Add highlight.js CDN for syntax highlighting in chat responses
- Reduce small-model memory context budget (4000→2000) for expanded prompt
- Add 27 comprehensive tests covering the full chat-to-task pipeline

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alexander Payne
2026-02-26 11:42:42 -05:00
parent 6c6b6f8a54
commit 3ca8e9f2d6
7 changed files with 526 additions and 10 deletions

View File

@@ -1,3 +1,5 @@
import logging
import re
from datetime import datetime
from pathlib import Path
@@ -8,9 +10,144 @@ from fastapi.templating import Jinja2Templates
from timmy.session import chat as timmy_chat
from dashboard.store import message_log
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/agents", tags=["agents"])
templates = Jinja2Templates(directory=str(Path(__file__).parent.parent / "templates"))
# ── Task queue detection ──────────────────────────────────────────────────
# Patterns that indicate the user wants to queue a task rather than chat
_QUEUE_PATTERNS = [
re.compile(r"\b(?:add|put|schedule|queue|submit)\b.*\b(?:to the|on the|in the)?\s*(?:queue|task(?:\s*queue)?|task list)\b", re.IGNORECASE),
re.compile(r"\bschedule\s+(?:this|that|a)\b", re.IGNORECASE),
re.compile(r"\bcreate\s+(?:a\s+|an\s+)?(?:\w+\s+){0,3}task\b", re.IGNORECASE),
]
# Questions about tasks/queue should NOT trigger task creation
_QUESTION_PREFIXES = re.compile(
r"^(?:what|how|why|can you explain|could you explain|tell me about|explain|"
r"what(?:'s| is| are| would))\b",
re.IGNORECASE,
)
_QUESTION_FRAMES = re.compile(
r"\b(?:how (?:do|does|would|can|should)|what (?:is|are|would)|"
r"can you (?:explain|describe|tell)|best way to)\b",
re.IGNORECASE,
)
# Known agent names for task assignment parsing
_KNOWN_AGENTS = frozenset({
"timmy", "echo", "mace", "helm", "seer",
"forge", "quill", "pixel", "lyra", "reel",
})
_AGENT_PATTERN = re.compile(
r"\bfor\s+(" + "|".join(_KNOWN_AGENTS) + r")\b", re.IGNORECASE
)
# Priority keywords → task priority mapping
_PRIORITY_MAP = {
"urgent": "urgent",
"critical": "urgent",
"asap": "urgent",
"emergency": "urgent",
"high priority": "high",
"high-priority": "high",
"important": "high",
"low priority": "low",
"low-priority": "low",
"minor": "low",
}
# Queue context detection
_QUEUE_QUERY_PATTERN = re.compile(
r"\b(?:task(?:s|\s+queue)?|queue|what(?:'s| is) (?:in |on )?(?:the )?queue)\b",
re.IGNORECASE,
)
def _extract_agent_from_message(message: str) -> str:
"""Extract target agent name from message, defaulting to 'timmy'."""
m = _AGENT_PATTERN.search(message)
if m:
return m.group(1).lower()
return "timmy"
def _extract_priority_from_message(message: str) -> str:
"""Extract priority level from message, defaulting to 'normal'."""
msg_lower = message.lower()
for keyword, priority in sorted(_PRIORITY_MAP.items(), key=lambda x: -len(x[0])):
if keyword in msg_lower:
return priority
return "normal"
def _extract_task_from_message(message: str) -> dict | None:
"""If the message looks like a task-queue request, return task details.
Returns None for meta-questions about tasks (e.g. "how do I create a task?").
"""
if _QUESTION_PREFIXES.search(message) or _QUESTION_FRAMES.search(message):
return None
for pattern in _QUEUE_PATTERNS:
if pattern.search(message):
# Strip the queue instruction to get the actual task description
title = re.sub(
r"\b(?:add|put|schedule|queue|submit|create)\b.*?\b(?:to the|on the|in the|an?)?(?:\s+\w+){0,3}\s*(?:queue|task(?:\s*queue)?|task list)\b",
"", message, flags=re.IGNORECASE,
).strip(" ,:;-")
# Strip "for {agent}" from title
title = _AGENT_PATTERN.sub("", title).strip(" ,:;-")
# Strip priority keywords from title
title = re.sub(
r"\b(?:urgent|critical|asap|emergency|high[- ]priority|important|low[- ]priority|minor)\b",
"", title, flags=re.IGNORECASE,
).strip(" ,:;-")
# Strip leading "to " that often remains
title = re.sub(r"^to\s+", "", title, flags=re.IGNORECASE).strip()
# Clean up double spaces
title = re.sub(r"\s{2,}", " ", title).strip()
# Fallback to full message if stripping removed everything
if not title or len(title) < 5:
title = message
# Capitalize first letter
title = title[0].upper() + title[1:] if title else title
agent = _extract_agent_from_message(message)
priority = _extract_priority_from_message(message)
return {
"title": title[:120],
"description": message,
"agent": agent,
"priority": priority,
}
return None
def _build_queue_context() -> str:
"""Build a concise task queue summary for context injection."""
try:
from task_queue.models import get_counts_by_status, list_tasks, TaskStatus
counts = get_counts_by_status()
pending = counts.get("pending_approval", 0)
running = counts.get("running", 0)
completed = counts.get("completed", 0)
parts = [f"[System: Task queue — {pending} pending approval, {running} running, {completed} completed."]
if pending > 0:
tasks = list_tasks(status=TaskStatus.PENDING_APPROVAL, limit=5)
if tasks:
items = ", ".join(f'"{t.title}" ({t.assigned_to})' for t in tasks)
parts.append(f"Pending: {items}.")
if running > 0:
tasks = list_tasks(status=TaskStatus.RUNNING, limit=5)
if tasks:
items = ", ".join(f'"{t.title}" ({t.assigned_to})' for t in tasks)
parts.append(f"Running: {items}.")
return " ".join(parts) + "]"
except Exception as exc:
logger.debug("Failed to build queue context: %s", exc)
return ""
# Static metadata for known agents — enriched onto live registry entries.
_AGENT_METADATA: dict[str, dict] = {
"timmy": {
@@ -74,10 +211,46 @@ async def chat_timmy(request: Request, message: str = Form(...)):
response_text = None
error_text = None
try:
response_text = timmy_chat(message)
except Exception as exc:
error_text = f"Timmy is offline: {exc}"
# Check if the user wants to queue a task instead of chatting
task_info = _extract_task_from_message(message)
if task_info:
try:
from task_queue.models import create_task
task = create_task(
title=task_info["title"],
description=task_info["description"],
created_by="user",
assigned_to=task_info.get("agent", "timmy"),
priority=task_info.get("priority", "normal"),
requires_approval=True,
)
priority_label = f" | Priority: `{task.priority.value}`" if task.priority.value != "normal" else ""
response_text = (
f"Task queued for approval: **{task.title}**\n\n"
f"Assigned to: `{task.assigned_to}`{priority_label} | "
f"Status: `{task.status.value}` | "
f"[View Task Queue](/tasks)"
)
logger.info("Chat → task queue: %s%s (id=%s)", task.title, task.assigned_to, task.id)
except Exception as exc:
logger.error("Failed to create task from chat: %s", exc)
task_info = None
# Normal chat path (also used as fallback if task creation failed)
if not task_info:
try:
now = datetime.now()
context_parts = [
f"[System: Current date/time is {now.strftime('%A, %B %d, %Y at %I:%M %p')}]"
]
if _QUEUE_QUERY_PATTERN.search(message):
queue_ctx = _build_queue_context()
if queue_ctx:
context_parts.append(queue_ctx)
context_prefix = "\n".join(context_parts) + "\n\n"
response_text = timmy_chat(context_prefix + message)
except Exception as exc:
error_text = f"Timmy is offline: {exc}"
message_log.append(role="user", content=message, timestamp=timestamp)
if response_text is not None: