fix: chat evaluation bugs — task pipeline, prompt grounding, markdown rendering

Addresses 14 bugs from 3 rounds of deep chat evaluation:

- Add chat-to-task pipeline in agents.py with regex-based intent detection,
  agent extraction, priority extraction, and title cleaning
- Filter meta-questions ("how do I create a task?") from task creation
- Inject real-time date/time context into every chat message
- Inject live queue state when user asks about tasks
- Ground system prompts with agent roster, honesty guardrails, self-knowledge,
  math delegation template, anti-filler rules, values-conflict guidance
- Add CSS for markdown code blocks, inline code, lists, blockquotes in chat
- Add highlight.js CDN for syntax highlighting in chat responses
- Reduce small-model memory context budget (4000→2000) for expanded prompt
- Add 27 comprehensive tests covering the full chat-to-task pipeline

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alexander Payne
2026-02-26 11:42:42 -05:00
parent 6c6b6f8a54
commit 3ca8e9f2d6
7 changed files with 526 additions and 10 deletions

View File

@@ -304,3 +304,244 @@ def test_api_approve_nonexistent(client):
def test_api_veto_nonexistent(client):
resp = client.patch("/api/tasks/nonexistent/veto")
assert resp.status_code == 404
# ── Chat-to-Task Pipeline Tests ──────────────────────────────────────────
class TestExtractTaskFromMessage:
"""Tests for _extract_task_from_message — queue intent detection."""
def test_add_to_queue(self):
from dashboard.routes.agents import _extract_task_from_message
result = _extract_task_from_message("Add refactor the login to the task queue")
assert result is not None
assert result["agent"] == "timmy"
assert result["priority"] == "normal"
def test_schedule_this(self):
from dashboard.routes.agents import _extract_task_from_message
result = _extract_task_from_message("Schedule this for later")
assert result is not None
def test_create_a_task(self):
from dashboard.routes.agents import _extract_task_from_message
result = _extract_task_from_message("Create a task to fix the login page")
assert result is not None
assert "title" in result
def test_normal_message_returns_none(self):
from dashboard.routes.agents import _extract_task_from_message
assert _extract_task_from_message("Hello, how are you?") is None
def test_meta_question_about_tasks_returns_none(self):
from dashboard.routes.agents import _extract_task_from_message
assert _extract_task_from_message("How do I create a task?") is None
def test_what_is_question_returns_none(self):
from dashboard.routes.agents import _extract_task_from_message
assert _extract_task_from_message("What is a task queue?") is None
def test_explain_question_returns_none(self):
from dashboard.routes.agents import _extract_task_from_message
assert _extract_task_from_message("Can you explain how to create a task?") is None
def test_what_would_question_returns_none(self):
from dashboard.routes.agents import _extract_task_from_message
assert _extract_task_from_message("What would a task flow look like?") is None
class TestExtractAgentFromMessage:
"""Tests for _extract_agent_from_message."""
def test_extracts_forge(self):
from dashboard.routes.agents import _extract_agent_from_message
assert _extract_agent_from_message("Create a task for Forge to refactor") == "forge"
def test_extracts_echo(self):
from dashboard.routes.agents import _extract_agent_from_message
assert _extract_agent_from_message("Add research for Echo to the queue") == "echo"
def test_case_insensitive(self):
from dashboard.routes.agents import _extract_agent_from_message
assert _extract_agent_from_message("Schedule this for SEER") == "seer"
def test_defaults_to_timmy(self):
from dashboard.routes.agents import _extract_agent_from_message
assert _extract_agent_from_message("Create a task to fix the bug") == "timmy"
def test_ignores_unknown_agent(self):
from dashboard.routes.agents import _extract_agent_from_message
assert _extract_agent_from_message("Create a task for BobAgent") == "timmy"
class TestExtractPriorityFromMessage:
"""Tests for _extract_priority_from_message."""
def test_urgent(self):
from dashboard.routes.agents import _extract_priority_from_message
assert _extract_priority_from_message("urgent: fix the server") == "urgent"
def test_critical(self):
from dashboard.routes.agents import _extract_priority_from_message
assert _extract_priority_from_message("This is critical, do it now") == "urgent"
def test_asap(self):
from dashboard.routes.agents import _extract_priority_from_message
assert _extract_priority_from_message("Fix this ASAP") == "urgent"
def test_high_priority(self):
from dashboard.routes.agents import _extract_priority_from_message
assert _extract_priority_from_message("This is important work") == "high"
def test_low_priority(self):
from dashboard.routes.agents import _extract_priority_from_message
assert _extract_priority_from_message("minor cleanup task") == "low"
def test_default_normal(self):
from dashboard.routes.agents import _extract_priority_from_message
assert _extract_priority_from_message("Fix the login page") == "normal"
class TestTitleCleaning:
"""Tests for task title extraction and cleaning."""
def test_strips_agent_from_title(self):
from dashboard.routes.agents import _extract_task_from_message
result = _extract_task_from_message("Create a task for Forge to refactor the login")
assert result is not None
assert "forge" not in result["title"].lower()
assert "for" not in result["title"].lower().split()[0:1] # "for" stripped
def test_strips_priority_from_title(self):
from dashboard.routes.agents import _extract_task_from_message
result = _extract_task_from_message("Create an urgent task to fix the server")
assert result is not None
assert "urgent" not in result["title"].lower()
def test_title_is_capitalized(self):
from dashboard.routes.agents import _extract_task_from_message
result = _extract_task_from_message("Add refactor the login to the task queue")
assert result is not None
assert result["title"][0].isupper()
def test_title_capped_at_120_chars(self):
from dashboard.routes.agents import _extract_task_from_message
long_msg = "Create a task to " + "x" * 200
result = _extract_task_from_message(long_msg)
assert result is not None
assert len(result["title"]) <= 120
class TestFullExtraction:
"""Tests for combined agent + priority + title extraction."""
def test_task_includes_agent_and_priority(self):
from dashboard.routes.agents import _extract_task_from_message
result = _extract_task_from_message("Create a high priority task for Forge to refactor auth")
assert result is not None
assert result["agent"] == "forge"
assert result["priority"] == "high"
assert result["description"] # original message preserved
def test_create_with_all_fields(self):
from dashboard.routes.agents import _extract_task_from_message
result = _extract_task_from_message("Add an urgent task for Mace to audit security to the queue")
assert result is not None
assert result["agent"] == "mace"
assert result["priority"] == "urgent"
# ── Integration: chat_timmy Route ─────────────────────────────────────────
class TestChatTimmyIntegration:
"""Integration tests for the /agents/timmy/chat route."""
def test_chat_creates_task_on_queue_request(self, client):
resp = client.post(
"/agents/timmy/chat",
data={"message": "Create a task to refactor the login module"},
)
assert resp.status_code == 200
assert "Task queued" in resp.text or "task" in resp.text.lower()
def test_chat_creates_task_with_agent(self, client):
resp = client.post(
"/agents/timmy/chat",
data={"message": "Add deploy monitoring for Helm to the task queue"},
)
assert resp.status_code == 200
assert "helm" in resp.text.lower() or "Task queued" in resp.text
def test_chat_creates_task_with_priority(self, client):
resp = client.post(
"/agents/timmy/chat",
data={"message": "Create an urgent task to fix the production server"},
)
assert resp.status_code == 200
assert "Task queued" in resp.text or "urgent" in resp.text.lower()
@patch("dashboard.routes.agents.timmy_chat")
def test_chat_injects_datetime_context(self, mock_chat, client):
mock_chat.return_value = "Hello there!"
client.post(
"/agents/timmy/chat",
data={"message": "Hello Timmy"},
)
mock_chat.assert_called_once()
call_arg = mock_chat.call_args[0][0]
assert "[System: Current date/time is" in call_arg
@patch("dashboard.routes.agents.timmy_chat")
@patch("dashboard.routes.agents._build_queue_context")
def test_chat_injects_queue_context_on_queue_query(self, mock_ctx, mock_chat, client):
mock_ctx.return_value = "[System: Task queue — 3 pending approval, 1 running, 5 completed.]"
mock_chat.return_value = "There are 3 tasks pending."
client.post(
"/agents/timmy/chat",
data={"message": "What tasks are in the queue?"},
)
mock_ctx.assert_called_once()
mock_chat.assert_called_once()
call_arg = mock_chat.call_args[0][0]
assert "[System: Task queue" in call_arg
@patch("dashboard.routes.agents.timmy_chat")
@patch("dashboard.routes.agents._build_queue_context")
def test_chat_no_queue_context_for_normal_message(self, mock_ctx, mock_chat, client):
mock_chat.return_value = "Hi!"
client.post(
"/agents/timmy/chat",
data={"message": "Tell me a joke"},
)
mock_ctx.assert_not_called()
@patch("dashboard.routes.agents.timmy_chat")
def test_chat_normal_message_uses_timmy(self, mock_chat, client):
mock_chat.return_value = "I'm doing well, thank you."
resp = client.post(
"/agents/timmy/chat",
data={"message": "How are you?"},
)
assert resp.status_code == 200
mock_chat.assert_called_once()
class TestBuildQueueContext:
"""Tests for _build_queue_context helper."""
def test_returns_string_with_counts(self):
from dashboard.routes.agents import _build_queue_context
from task_queue.models import create_task
create_task(title="Context test task", created_by="test")
ctx = _build_queue_context()
assert "[System: Task queue" in ctx
assert "pending" in ctx.lower()
def test_returns_empty_on_error(self):
from dashboard.routes.agents import _build_queue_context
with patch("task_queue.models.get_counts_by_status", side_effect=Exception("DB error")):
ctx = _build_queue_context()
assert isinstance(ctx, str)
assert ctx == ""