fix: chat evaluation bugs — task pipeline, prompt grounding, markdown rendering

Addresses 14 bugs from 3 rounds of deep chat evaluation:

- Add chat-to-task pipeline in agents.py with regex-based intent detection, agent extraction, priority extraction, and title cleaning
- Filter meta-questions ("how do I create a task?") from task creation
- Inject real-time date/time context into every chat message
- Inject live queue state when user asks about tasks
- Ground system prompts with agent roster, honesty guardrails, self-knowledge, math delegation template, anti-filler rules, values-conflict guidance
- Add CSS for markdown code blocks, inline code, lists, blockquotes in chat
- Add highlight.js CDN for syntax highlighting in chat responses
- Reduce small-model memory context budget (4000→2000) for expanded prompt
- Add 27 comprehensive tests covering the full chat-to-task pipeline

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 11:42:42 -05:00
parent 6c6b6f8a54
commit 3ca8e9f2d6
7 changed files with 526 additions and 10 deletions
--- a/tests/test_task_queue.py
+++ b/tests/test_task_queue.py
@@ -304,3 +304,244 @@ def test_api_approve_nonexistent(client):
 def test_api_veto_nonexistent(client):
    """PATCH to the veto endpoint with the id "nonexistent" is expected to return 404."""
    resp = client.patch("/api/tasks/nonexistent/veto")
    assert resp.status_code == 404
+
+
+# ── Chat-to-Task Pipeline Tests ──────────────────────────────────────────
+
+
+class TestExtractTaskFromMessage:
+    """Tests for _extract_task_from_message — queue intent detection.
+
+    Request-style messages are expected to produce a non-None result that can
+    be indexed by "agent", "priority" and "title"; greetings and questions
+    *about* tasks are expected to produce None.
+    """
+
+    def test_add_to_queue(self):
+        """An "add ... to the task queue" request yields a task with default fields."""
+        from dashboard.routes.agents import _extract_task_from_message
+        result = _extract_task_from_message("Add refactor the login to the task queue")
+        assert result is not None
+        # The message names neither an agent nor a priority, so the expected
+        # values here are "timmy" and "normal".
+        assert result["agent"] == "timmy"
+        assert result["priority"] == "normal"
+
+    def test_schedule_this(self):
+        """A "schedule this" phrasing is also treated as a task request."""
+        from dashboard.routes.agents import _extract_task_from_message
+        result = _extract_task_from_message("Schedule this for later")
+        assert result is not None
+
+    def test_create_a_task(self):
+        """A "create a task to ..." request yields a result that has a "title" key."""
+        from dashboard.routes.agents import _extract_task_from_message
+        result = _extract_task_from_message("Create a task to fix the login page")
+        assert result is not None
+        assert "title" in result
+
+    def test_normal_message_returns_none(self):
+        """A plain greeting must not be interpreted as a task request."""
+        from dashboard.routes.agents import _extract_task_from_message
+        assert _extract_task_from_message("Hello, how are you?") is None
+
+    # The remaining cases are meta-questions: they mention tasks or the queue
+    # but ask about them rather than requesting one, so None is expected.
+
+    def test_meta_question_about_tasks_returns_none(self):
+        """A "How do I ...?" question about creating tasks yields None."""
+        from dashboard.routes.agents import _extract_task_from_message
+        assert _extract_task_from_message("How do I create a task?") is None
+
+    def test_what_is_question_returns_none(self):
+        """A "What is ...?" question about the task queue yields None."""
+        from dashboard.routes.agents import _extract_task_from_message
+        assert _extract_task_from_message("What is a task queue?") is None
+
+    def test_explain_question_returns_none(self):
+        """A "Can you explain ...?" question about creating tasks yields None."""
+        from dashboard.routes.agents import _extract_task_from_message
+        assert _extract_task_from_message("Can you explain how to create a task?") is None
+
+    def test_what_would_question_returns_none(self):
+        """A "What would ...?" question about a task flow yields None."""
+        from dashboard.routes.agents import _extract_task_from_message
+        assert _extract_task_from_message("What would a task flow look like?") is None
+
+
+class TestExtractAgentFromMessage:
+    """Tests for _extract_agent_from_message.
+
+    The helper is expected to return a lowercase agent name when the message
+    says "for <Agent>", and "timmy" otherwise.
+    """
+
+    def test_extracts_forge(self):
+        """"for Forge" in the message resolves to the agent "forge"."""
+        from dashboard.routes.agents import _extract_agent_from_message
+        assert _extract_agent_from_message("Create a task for Forge to refactor") == "forge"
+
+    def test_extracts_echo(self):
+        """The agent is found even when it appears mid-sentence."""
+        from dashboard.routes.agents import _extract_agent_from_message
+        assert _extract_agent_from_message("Add research for Echo to the queue") == "echo"
+
+    def test_case_insensitive(self):
+        """An upper-case agent name is matched and returned in lowercase."""
+        from dashboard.routes.agents import _extract_agent_from_message
+        assert _extract_agent_from_message("Schedule this for SEER") == "seer"
+
+    def test_defaults_to_timmy(self):
+        """With no agent named, the result is "timmy"."""
+        from dashboard.routes.agents import _extract_agent_from_message
+        assert _extract_agent_from_message("Create a task to fix the bug") == "timmy"
+
+    def test_ignores_unknown_agent(self):
+        """A "for <name>" that is not a recognised agent falls back to "timmy"."""
+        from dashboard.routes.agents import _extract_agent_from_message
+        assert _extract_agent_from_message("Create a task for BobAgent") == "timmy"
+
+
+class TestExtractPriorityFromMessage:
+    """Tests for _extract_priority_from_message.
+
+    The cases below expect one of four values: "urgent", "high", "low" or
+    "normal" (the result when no priority keyword is present).
+    """
+
+    def test_urgent(self):
+        """The literal word "urgent" maps to "urgent"."""
+        from dashboard.routes.agents import _extract_priority_from_message
+        assert _extract_priority_from_message("urgent: fix the server") == "urgent"
+
+    def test_critical(self):
+        """"critical" is treated as a synonym for "urgent"."""
+        from dashboard.routes.agents import _extract_priority_from_message
+        assert _extract_priority_from_message("This is critical, do it now") == "urgent"
+
+    def test_asap(self):
+        """Upper-case "ASAP" also maps to "urgent"."""
+        from dashboard.routes.agents import _extract_priority_from_message
+        assert _extract_priority_from_message("Fix this ASAP") == "urgent"
+
+    def test_high_priority(self):
+        """"important" maps to "high"."""
+        from dashboard.routes.agents import _extract_priority_from_message
+        assert _extract_priority_from_message("This is important work") == "high"
+
+    def test_low_priority(self):
+        """"minor" maps to "low"."""
+        from dashboard.routes.agents import _extract_priority_from_message
+        assert _extract_priority_from_message("minor cleanup task") == "low"
+
+    def test_default_normal(self):
+        """A message with no priority keyword yields "normal"."""
+        from dashboard.routes.agents import _extract_priority_from_message
+        assert _extract_priority_from_message("Fix the login page") == "normal"
+
+
+class TestTitleCleaning:
+    """Tests for task title extraction and cleaning.
+
+    All cases go through _extract_task_from_message and inspect the "title"
+    entry of its result.
+    """
+
+    def test_strips_agent_from_title(self):
+        """The agent name and its leading "for" must not remain in the title."""
+        from dashboard.routes.agents import _extract_task_from_message
+        result = _extract_task_from_message("Create a task for Forge to refactor the login")
+        assert result is not None
+        assert "forge" not in result["title"].lower()
+        # The [0:1] slice yields a list holding at most the first word, so this
+        # only checks that the title does not *start* with "for", and it cannot
+        # raise IndexError if the title were empty.
+        assert "for" not in result["title"].lower().split()[0:1]  # "for" stripped
+
+    def test_strips_priority_from_title(self):
+        """The priority keyword ("urgent") must not remain in the title."""
+        from dashboard.routes.agents import _extract_task_from_message
+        result = _extract_task_from_message("Create an urgent task to fix the server")
+        assert result is not None
+        assert "urgent" not in result["title"].lower()
+
+    def test_title_is_capitalized(self):
+        """The first character of the title is upper-case."""
+        from dashboard.routes.agents import _extract_task_from_message
+        result = _extract_task_from_message("Add refactor the login to the task queue")
+        assert result is not None
+        assert result["title"][0].isupper()
+
+    def test_title_capped_at_120_chars(self):
+        """A very long request produces a title of at most 120 characters."""
+        from dashboard.routes.agents import _extract_task_from_message
+        # 200 filler characters guarantee the raw text exceeds the 120 limit.
+        long_msg = "Create a task to " + "x" * 200
+        result = _extract_task_from_message(long_msg)
+        assert result is not None
+        assert len(result["title"]) <= 120
+
+
+class TestFullExtraction:
+    """Tests for combined agent + priority + title extraction.
+
+    Each message names both an agent and a priority so that a single call to
+    _extract_task_from_message has to populate both fields.
+    """
+
+    def test_task_includes_agent_and_priority(self):
+        """"high priority" plus "for Forge" yields priority "high" and agent "forge"."""
+        from dashboard.routes.agents import _extract_task_from_message
+        result = _extract_task_from_message("Create a high priority task for Forge to refactor auth")
+        assert result is not None
+        assert result["agent"] == "forge"
+        assert result["priority"] == "high"
+        # Only truthiness is checked here, not equality with the input message.
+        assert result["description"]  # original message preserved
+
+    def test_create_with_all_fields(self):
+        """"urgent" plus "for Mace" yields priority "urgent" and agent "mace"."""
+        from dashboard.routes.agents import _extract_task_from_message
+        result = _extract_task_from_message("Add an urgent task for Mace to audit security to the queue")
+        assert result is not None
+        assert result["agent"] == "mace"
+        assert result["priority"] == "urgent"
+
+
+# ── Integration: chat_timmy Route ─────────────────────────────────────────
+
+
+class TestChatTimmyIntegration:
+    """Integration tests for the /agents/timmy/chat route.
+
+    Requests are sent as form data through the `client` fixture. The first
+    three tests exercise the task-creation path without mocking; the rest
+    replace `dashboard.routes.agents.timmy_chat` with a mock and inspect the
+    text that the route passes to it.
+
+    NOTE(review): `patch` is not imported inside the visible hunk; presumably
+    it is imported at the top of this test module — confirm.
+    """
+
+    def test_chat_creates_task_on_queue_request(self, client):
+        """A "create a task" message returns 200 and a task-related response."""
+        resp = client.post(
+            "/agents/timmy/chat",
+            data={"message": "Create a task to refactor the login module"},
+        )
+        assert resp.status_code == 200
+        # NOTE(review): the second alternative passes for any response that
+        # contains the word "task"; if the rendered response includes the
+        # user's message this is always true — consider tightening.
+        assert "Task queued" in resp.text or "task" in resp.text.lower()
+
+    def test_chat_creates_task_with_agent(self, client):
+        """A request naming Helm returns 200 and mentions the agent or the queued task."""
+        resp = client.post(
+            "/agents/timmy/chat",
+            data={"message": "Add deploy monitoring for Helm to the task queue"},
+        )
+        assert resp.status_code == 200
+        # NOTE(review): "helm" also appears in the submitted message, so the
+        # first alternative may pass without a task being created — verify.
+        assert "helm" in resp.text.lower() or "Task queued" in resp.text
+
+    def test_chat_creates_task_with_priority(self, client):
+        """An "urgent" request returns 200 and mentions the priority or the queued task."""
+        resp = client.post(
+            "/agents/timmy/chat",
+            data={"message": "Create an urgent task to fix the production server"},
+        )
+        assert resp.status_code == 200
+        # NOTE(review): "urgent" also appears in the submitted message, so the
+        # second alternative may pass without a task being created — verify.
+        assert "Task queued" in resp.text or "urgent" in resp.text.lower()
+
+    @patch("dashboard.routes.agents.timmy_chat")
+    def test_chat_injects_datetime_context(self, mock_chat, client):
+        """The text handed to timmy_chat carries a current date/time system note."""
+        mock_chat.return_value = "Hello there!"
+        client.post(
+            "/agents/timmy/chat",
+            data={"message": "Hello Timmy"},
+        )
+        mock_chat.assert_called_once()
+        # call_args[0] is the tuple of positional arguments; [0] is the first one.
+        call_arg = mock_chat.call_args[0][0]
+        assert "[System: Current date/time is" in call_arg
+
+    # Stacked @patch decorators are applied bottom-up, so the mock for
+    # _build_queue_context (mock_ctx) is passed before the one for timmy_chat.
+    @patch("dashboard.routes.agents.timmy_chat")
+    @patch("dashboard.routes.agents._build_queue_context")
+    def test_chat_injects_queue_context_on_queue_query(self, mock_ctx, mock_chat, client):
+        """A question about the queue triggers _build_queue_context and forwards its text."""
+        mock_ctx.return_value = "[System: Task queue — 3 pending approval, 1 running, 5 completed.]"
+        mock_chat.return_value = "There are 3 tasks pending."
+        client.post(
+            "/agents/timmy/chat",
+            data={"message": "What tasks are in the queue?"},
+        )
+        mock_ctx.assert_called_once()
+        mock_chat.assert_called_once()
+        # The mocked queue context must appear in the first positional argument.
+        call_arg = mock_chat.call_args[0][0]
+        assert "[System: Task queue" in call_arg
+
+    @patch("dashboard.routes.agents.timmy_chat")
+    @patch("dashboard.routes.agents._build_queue_context")
+    def test_chat_no_queue_context_for_normal_message(self, mock_ctx, mock_chat, client):
+        """A message unrelated to the queue must not call _build_queue_context."""
+        mock_chat.return_value = "Hi!"
+        client.post(
+            "/agents/timmy/chat",
+            data={"message": "Tell me a joke"},
+        )
+        mock_ctx.assert_not_called()
+
+    @patch("dashboard.routes.agents.timmy_chat")
+    def test_chat_normal_message_uses_timmy(self, mock_chat, client):
+        """An ordinary message returns 200 and is forwarded to timmy_chat exactly once."""
+        mock_chat.return_value = "I'm doing well, thank you."
+        resp = client.post(
+            "/agents/timmy/chat",
+            data={"message": "How are you?"},
+        )
+        assert resp.status_code == 200
+        mock_chat.assert_called_once()
+
+
+class TestBuildQueueContext:
+    """Tests for _build_queue_context helper.
+
+    Covers the normal path (a "[System: Task queue ...]" string) and the
+    failure path (an empty string when the status lookup raises).
+    """
+
+    def test_returns_string_with_counts(self):
+        """With at least one task created, the context names the queue and "pending"."""
+        from dashboard.routes.agents import _build_queue_context
+        from task_queue.models import create_task
+        # Create a task first so the queue is not empty when the context is built.
+        create_task(title="Context test task", created_by="test")
+        ctx = _build_queue_context()
+        assert "[System: Task queue" in ctx
+        assert "pending" in ctx.lower()
+
+    def test_returns_empty_on_error(self):
+        """If get_counts_by_status raises, the helper returns "" instead of propagating."""
+        from dashboard.routes.agents import _build_queue_context
+        # NOTE(review): this patches the attribute on task_queue.models; it only
+        # takes effect if _build_queue_context looks the function up on that
+        # module at call time — confirm against the implementation.
+        with patch("task_queue.models.get_counts_by_status", side_effect=Exception("DB error")):
+            ctx = _build_queue_context()
+            assert isinstance(ctx, str)
+            assert ctx == ""