feat: agentic loop for multi-step tasks + regression fixes (#148)

* fix: name extraction blocklist, memory preview escaping, and gitignore cleanup - Add _NAME_BLOCKLIST to extract_user_name() to reject gerunds and UI-state words like "Sending" that were incorrectly captured as user names - Collapse whitespace in get_memory_status() preview so newlines survive JSON serialization without showing raw \n escape sequences - Broaden .gitignore from specific memory/self/user_profile.md to memory/self/ and untrack memory/self/methodology.md (runtime-edited file) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: catch Ollama connection errors in session.py + add 71 smoke tests - Wrap agent.run() in session.py with try/except so Ollama connection failures return a graceful fallback message instead of dumping raw tracebacks to Docker logs - Add tests/test_smoke.py with 71 tests covering every GET route: core pages, feature pages, JSON APIs, and a parametrized no-500 sweep — catches import errors, template failures, and schema mismatches that unit tests miss Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * feat: agentic loop for multi-step tasks + Round 10 regression fixes Agentic loop (Parts 1-4): - Add multi-step chaining instructions to system prompt - New agentic_loop.py with plan→execute→adapt→summarize flow - Register plan_and_execute tool for background task execution - Add max_agent_steps config setting (default: 10) - Discord fix: 300s timeout, typing indicator, send error handling - 16 new unit + e2e tests for agentic loop Round 10 regressions (R1-R5, P1): - R1: Fix literal \n escape sequences in tool responses - R2: Chat timeout/error feedback in agent panel - R3: /hands infinite spinner → static empty states - R4: /self-coding infinite spinner → static stats + journal - R5: /grok/status raw JSON → HTML dashboard template - P1: VETO confirmation dialog on task cards Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: briefing route 500 in CI when agno is MagicMock stub _call_agent() returned a 
MagicMock instead of a string when agno is stubbed in tests, causing SQLite "Error binding parameter 4" on save. Ensure the return value is always an actual string. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: briefing route 500 in CI — graceful degradation at route level When agno is stubbed with MagicMock in CI, agent.run() returns a MagicMock instead of raising — so the exception handler never fires and a MagicMock propagates as the summary to SQLite, which can't bind it. Fix: catch at the route level and return a fallback Briefing object. This follows the project's graceful degradation pattern — the briefing page always renders, even when the backend is completely unavailable. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Trip T <trip@local> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 01:46:29 -05:00
parent b8e0f4539f
commit 7792ae745f
22 changed files with 1206 additions and 142 deletions
--- a/tests/e2e/test_agentic_chain.py
+++ b/tests/e2e/test_agentic_chain.py
@@ -0,0 +1,102 @@
+"""E2E: verify multi-step tool chaining works end-to-end.
+
+These tests validate the full agentic loop pipeline: planning,
+execution, adaptation, and progress tracking.
+"""
+
+import pytest
+from unittest.mock import MagicMock, patch, AsyncMock
+from timmy.agentic_loop import run_agentic_loop
+
+
+def _mock_run(content: str):
+    """Create a mock return value for agent.run()."""
+    m = MagicMock()
+    m.content = content  # only .content is set explicitly; every other attribute is an auto-created MagicMock
+    return m
+
+
+@pytest.mark.asyncio
+async def test_multistep_chain_completes_all_steps():
+    """GREEN PATH: multi-step prompt executes all steps."""
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # one entry is consumed per agent.run() call, in order
+        _mock_run("1. Search AI news\n2. Write to file\n3. Verify"),  # planning reply: three numbered steps
+        _mock_run("Found 5 articles about AI in March 2026."),  # step 1 result
+        _mock_run("Wrote summary to /tmp/ai_news.md"),  # step 2 result
+        _mock_run("File exists, 15 lines."),  # step 3 result
+        _mock_run("Searched, wrote, verified."),  # summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Search AI news and write summary to file")
+
+    assert result.status == "completed"
+    assert len(result.steps) == 3  # one recorded step per planned step
+    assert mock_agent.run.call_count == 5  # plan + 3 steps + summary
+
+
+@pytest.mark.asyncio
+async def test_multistep_chain_adapts_on_failure():
+    """Step failure -> model adapts -> continues."""
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # one entry is consumed per agent.run() call, in order
+        _mock_run("1. Read config\n2. Update setting\n3. Verify"),  # planning reply: three numbered steps
+        _mock_run("Config: timeout=30"),  # step 1 result
+        Exception("Permission denied"),  # an exception instance in side_effect is raised by that call
+        _mock_run("Adapted: wrote to ~/config.yaml instead"),  # presumably the adaptation retry for step 2 — verify against agentic_loop
+        _mock_run("Verified: timeout=60"),  # presumably step 3 result — verify against agentic_loop
+        _mock_run("Updated config. Used ~/config.yaml due to permissions."),  # presumably the summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Update config timeout to 60")
+
+    assert result.status == "completed"  # a recovered failure must not downgrade the overall status
+    assert any(s.status == "adapted" for s in result.steps)  # at least one step is marked "adapted"
+
+
+@pytest.mark.asyncio
+async def test_max_steps_enforced():
+    """Loop stops at max_steps."""
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # only 4 replies: any extra agent.run() call raises StopIteration
+        _mock_run("1. A\n2. B\n3. C\n4. D\n5. E"),  # planning reply: five steps, more than max_steps
+        _mock_run("A done"),  # step 1 result
+        _mock_run("B done"),  # step 2 result
+        _mock_run("Completed 2 of 5 steps."),  # summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Do 5 things", max_steps=2)
+
+    assert len(result.steps) == 2  # capped at max_steps, not the plan length
+    assert result.status == "partial"  # a truncated plan is reported as "partial"
+
+
+@pytest.mark.asyncio
+async def test_progress_events_fire():
+    """Progress callback fires per step."""
+    events = []
+
+    async def on_progress(desc, step, total):
+        events.append((step, total))  # desc is ignored; only (step, total) is recorded
+
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # one entry is consumed per agent.run() call, in order
+        _mock_run("1. Do A\n2. Do B"),  # planning reply: two steps
+        _mock_run("A done"),  # step 1 result
+        _mock_run("B done"),  # step 2 result
+        _mock_run("All done"),  # summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        await run_agentic_loop("Do A and B", on_progress=on_progress)
+
+    assert len(events) == 2  # exactly one callback per executed step
+    assert events[0] == (1, 2)  # step numbers are 1-based; total is 2 for this two-step plan
+    assert events[1] == (2, 2)
--- a/tests/test_agentic_loop.py
+++ b/tests/test_agentic_loop.py
@@ -0,0 +1,213 @@
+"""Unit tests for the agentic loop module.
+
+Tests cover planning, execution, max_steps enforcement, failure
+adaptation, progress callbacks, and response cleaning.
+"""
+
+import pytest
+from unittest.mock import MagicMock, patch, AsyncMock
+from timmy.agentic_loop import (
+    run_agentic_loop,
+    _parse_steps,
+    AgenticResult,
+    AgenticStep,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _mock_run(content: str):
+    """Create a mock return value for agent.run()."""
+    m = MagicMock()
+    m.content = content  # only .content is set explicitly; every other attribute is an auto-created MagicMock
+    return m
+
+
+# ---------------------------------------------------------------------------
+# _parse_steps
+# ---------------------------------------------------------------------------
+
+class TestParseSteps:  # pins the plan formats _parse_steps must accept
+    def test_numbered_with_dot(self):  # "N." prefixes are stripped from each line
+        text = "1. Search for data\n2. Write to file\n3. Verify"
+        assert _parse_steps(text) == ["Search for data", "Write to file", "Verify"]
+
+    def test_numbered_with_paren(self):  # "N)" prefixes are stripped as well
+        text = "1) Read config\n2) Update value\n3) Restart"
+        assert _parse_steps(text) == ["Read config", "Update value", "Restart"]
+
+    def test_fallback_plain_lines(self):  # unnumbered text: each line becomes one step
+        text = "Search the web\nWrite results\nDone"
+        assert _parse_steps(text) == ["Search the web", "Write results", "Done"]
+
+    def test_empty_returns_empty(self):  # empty input yields an empty list, not [""]
+        assert _parse_steps("") == []
+
+
+# ---------------------------------------------------------------------------
+# run_agentic_loop
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_planning_phase_produces_steps():
+    """Planning prompt returns numbered step list."""
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # one entry is consumed per agent.run() call, in order
+        _mock_run("1. Search AI news\n2. Write to file\n3. Verify"),  # planning reply: three numbered steps
+        _mock_run("Found 5 articles about AI."),  # step 1 result
+        _mock_run("Wrote summary to /tmp/ai_news.md"),  # step 2 result
+        _mock_run("File verified, 15 lines."),  # step 3 result
+        _mock_run("Searched, wrote, verified."),  # summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Search AI news and write summary")
+
+    assert result.status == "completed"
+    assert len(result.steps) == 3  # one recorded step per numbered plan line
+
+
+@pytest.mark.asyncio
+async def test_loop_executes_all_steps():
+    """Loop calls agent.run() for plan + each step + summary."""
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # one entry is consumed per agent.run() call, in order
+        _mock_run("1. Do A\n2. Do B"),  # planning reply: two steps
+        _mock_run("A done"),  # step 1 result
+        _mock_run("B done"),  # step 2 result
+        _mock_run("All done"),  # summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Do A and B")
+
+    # plan + 2 steps + summary = 4 calls
+    assert mock_agent.run.call_count == 4
+    assert len(result.steps) == 2  # plan and summary calls are not recorded as steps
+
+
+@pytest.mark.asyncio
+async def test_loop_respects_max_steps():
+    """Loop stops at max_steps and returns status='partial'."""
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # only 4 replies: any extra agent.run() call raises StopIteration
+        _mock_run("1. A\n2. B\n3. C\n4. D\n5. E"),  # planning reply: five steps, more than max_steps
+        _mock_run("A done"),  # step 1 result
+        _mock_run("B done"),  # step 2 result
+        _mock_run("Completed 2 of 5 steps."),  # summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Do 5 things", max_steps=2)
+
+    assert len(result.steps) == 2  # capped at max_steps, not the plan length
+    assert result.status == "partial"
+
+
+@pytest.mark.asyncio
+async def test_failure_triggers_adaptation():
+    """Failed step feeds error back to model, step marked as adapted."""
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # one entry is consumed per agent.run() call, in order
+        _mock_run("1. Read config\n2. Update setting\n3. Verify"),  # planning reply: three numbered steps
+        _mock_run("Config: timeout=30"),  # step 1 result
+        Exception("Permission denied"),  # an exception instance in side_effect is raised by that call
+        _mock_run("Adapted: wrote to ~/config.yaml instead"),  # presumably the adaptation retry for step 2 — verify against agentic_loop
+        _mock_run("Verified: timeout=60"),  # presumably step 3 result — verify against agentic_loop
+        _mock_run("Updated config via alternative path."),  # presumably the summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Update config timeout to 60")
+
+    assert result.status == "completed"  # a recovered failure must not downgrade the overall status
+    assert any(s.status == "adapted" for s in result.steps)  # at least one step is marked "adapted"
+
+
+@pytest.mark.asyncio
+async def test_progress_callback_fires():
+    """on_progress called for each step completion."""
+    events = []
+
+    async def on_progress(desc, step, total):
+        events.append((step, total))  # desc is ignored; only (step, total) is recorded
+
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # one entry is consumed per agent.run() call, in order
+        _mock_run("1. Do A\n2. Do B"),  # planning reply: two steps
+        _mock_run("A done"),  # step 1 result
+        _mock_run("B done"),  # step 2 result
+        _mock_run("All done"),  # summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        await run_agentic_loop("Do A and B", on_progress=on_progress)
+
+    assert len(events) == 2  # exactly one callback per executed step
+    assert events[0] == (1, 2)  # step numbers are 1-based; total is 2 for this two-step plan
+    assert events[1] == (2, 2)
+
+
+@pytest.mark.asyncio
+async def test_result_contains_step_metadata():
+    """AgenticResult.steps has status and duration per step."""
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=[  # one entry is consumed per agent.run() call, in order
+        _mock_run("1. Search\n2. Write"),  # planning reply: two steps
+        _mock_run("Found results"),  # step 1 result
+        _mock_run("Written to file"),  # step 2 result
+        _mock_run("Done"),  # summary reply
+    ])
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Search and write")
+
+    for step in result.steps:  # NOTE(review): passes vacuously if result.steps is empty; no length assert here
+        assert step.status in ("completed", "failed", "adapted")
+        assert step.duration_ms >= 0  # zero is allowed: mocked calls can finish within the timer resolution
+        assert step.description  # truthiness check: must be non-empty
+        assert step.result  # truthiness check: must be non-empty
+
+
+@pytest.mark.asyncio
+async def test_config_default_used():
+    """When max_steps=0, uses settings.max_agent_steps."""
+    mock_agent = MagicMock()
+    # Return more steps than default config allows (10)
+    steps_text = "\n".join(f"{i}. Step {i}" for i in range(1, 15))  # 14 numbered plan lines ("1. Step 1" .. "14. Step 14")
+    side_effects = [_mock_run(steps_text)]  # first reply is the plan
+    # 10 step results + summary
+    for i in range(1, 11):
+        side_effects.append(_mock_run(f"Step {i} done"))
+    side_effects.append(_mock_run("Summary"))  # 12 replies in total; a 13th agent.run() call would raise StopIteration
+
+    mock_agent.run = MagicMock(side_effect=side_effects)
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Do 14 things", max_steps=0)
+
+    # Should be capped at 10 (config default)
+    assert len(result.steps) == 10  # NOTE(review): settings is not patched here; assumes max_agent_steps is 10 at test time — confirm
+
+
+@pytest.mark.asyncio
+async def test_planning_failure_returns_failed():
+    """If the planning phase fails, result.status is 'failed'."""
+    mock_agent = MagicMock()
+    mock_agent.run = MagicMock(side_effect=Exception("Model offline"))  # a single exception (not a list) is raised on every call
+
+    with patch("timmy.agentic_loop._get_loop_agent", return_value=mock_agent), \
+         patch("timmy.agentic_loop._broadcast_progress", new_callable=AsyncMock):
+        result = await run_agentic_loop("Do something")  # must return a result rather than propagate the exception
+
+    assert result.status == "failed"
+    assert "Planning failed" in result.summary  # substring match; the rest of the summary text is not pinned
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -0,0 +1,227 @@
+"""Smoke tests — verify every major page loads without uncaught exceptions.
+
+These tests catch regressions that unit tests miss: import errors,
+template rendering failures, database schema mismatches, and startup
+crashes.  They run fast (no Ollama needed) and should stay green on
+every commit.
+"""
+
+import pytest
+from fastapi.testclient import TestClient
+
+
+@pytest.fixture
+def client():
+    from dashboard.app import app  # imported inside the fixture, so an import error fails the tests using it, not collection
+    with TestClient(app, raise_server_exceptions=False) as c:  # unhandled server errors become 500 responses instead of raising
+        yield c  # the TestClient context is exited after the test finishes
+
+
+# ---------------------------------------------------------------------------
+# Core pages — these MUST return 200
+# ---------------------------------------------------------------------------
+
+class TestCorePages:  # strict: every test here accepts only HTTP 200
+    """Every core dashboard page loads without error."""
+
+    def test_index(self, client):
+        r = client.get("/")
+        assert r.status_code == 200
+
+    def test_health(self, client):
+        r = client.get("/health")
+        assert r.status_code == 200
+
+    def test_health_status(self, client):
+        r = client.get("/health/status")
+        assert r.status_code == 200
+
+    def test_agent_panel(self, client):
+        r = client.get("/agents/default/panel")  # uses the agent id "default"
+        assert r.status_code == 200
+
+    def test_agent_history(self, client):
+        r = client.get("/agents/default/history")  # uses the agent id "default"
+        assert r.status_code == 200
+
+
+# ---------------------------------------------------------------------------
+# Feature pages — should return 200 (or 307 redirect, never 500)
+# ---------------------------------------------------------------------------
+
+class TestFeaturePages:  # most pages must return 200; briefing, spark and marketplace may also return 307
+    """Feature pages load without 500 errors."""
+
+    def test_briefing(self, client):
+        r = client.get("/briefing")
+        assert r.status_code in (200, 307)  # a 307 redirect is accepted for this page
+
+    def test_thinking(self, client):
+        r = client.get("/thinking")
+        assert r.status_code == 200
+
+    def test_tools(self, client):
+        r = client.get("/tools")
+        assert r.status_code == 200
+
+    def test_memory(self, client):
+        r = client.get("/memory")
+        assert r.status_code == 200
+
+    def test_calm(self, client):
+        r = client.get("/calm")
+        assert r.status_code == 200
+
+    def test_tasks(self, client):
+        r = client.get("/tasks")
+        assert r.status_code == 200
+
+    def test_work_orders_queue(self, client):
+        r = client.get("/work-orders/queue")
+        assert r.status_code == 200
+
+    def test_mobile(self, client):
+        r = client.get("/mobile")
+        assert r.status_code == 200
+
+    def test_spark(self, client):
+        r = client.get("/spark")
+        assert r.status_code in (200, 307)  # a 307 redirect is accepted for this page
+
+    def test_models(self, client):
+        r = client.get("/models")
+        assert r.status_code == 200
+
+    def test_swarm_live(self, client):
+        r = client.get("/swarm/live")
+        assert r.status_code == 200
+
+    def test_swarm_events(self, client):
+        r = client.get("/swarm/events")
+        assert r.status_code == 200
+
+    def test_marketplace(self, client):
+        r = client.get("/marketplace")
+        assert r.status_code in (200, 307)  # a 307 redirect is accepted for this page
+
+
+# ---------------------------------------------------------------------------
+# JSON API endpoints — should return valid JSON, never 500
+# ---------------------------------------------------------------------------
+
+class TestAPIEndpoints:  # NOTE(review): only test_health_json parses the body; the other tests check the status code only
+    """API endpoints return valid JSON without server errors."""
+
+    def test_health_json(self, client):
+        r = client.get("/health")
+        assert r.status_code == 200
+        data = r.json()  # raises if the body is not valid JSON
+        assert "status" in data
+
+    def test_health_components(self, client):
+        r = client.get("/health/components")
+        assert r.status_code == 200
+
+    def test_health_sovereignty(self, client):
+        r = client.get("/health/sovereignty")
+        assert r.status_code == 200
+
+    def test_queue_status(self, client):
+        r = client.get("/api/queue/status")
+        assert r.status_code == 200
+
+    def test_tasks_api(self, client):
+        r = client.get("/api/tasks")
+        assert r.status_code == 200
+
+    def test_chat_history(self, client):
+        r = client.get("/api/chat/history")
+        assert r.status_code == 200
+
+    def test_tools_stats(self, client):
+        r = client.get("/tools/api/stats")
+        assert r.status_code == 200
+
+    def test_thinking_api(self, client):
+        r = client.get("/thinking/api")
+        assert r.status_code == 200
+
+    def test_notifications_api(self, client):
+        r = client.get("/api/notifications")
+        assert r.status_code == 200
+
+    def test_providers_api(self, client):
+        r = client.get("/router/api/providers")
+        assert r.status_code == 200
+
+    def test_mobile_status(self, client):
+        r = client.get("/mobile/status")
+        assert r.status_code == 200
+
+    def test_discord_status(self, client):
+        r = client.get("/discord/status")
+        assert r.status_code == 200
+
+    def test_telegram_status(self, client):
+        r = client.get("/telegram/status")
+        assert r.status_code == 200
+
+    def test_grok_status(self, client):
+        r = client.get("/grok/status")  # NOTE(review): tests/timmy/test_grok_backend.py asserts this route returns text/html, not JSON
+        assert r.status_code == 200
+
+    def test_paperclip_status(self, client):
+        r = client.get("/api/paperclip/status")
+        assert r.status_code == 200
+
+
+# ---------------------------------------------------------------------------
+# No 500s — every GET route should survive without server error
+# ---------------------------------------------------------------------------
+
+class TestNo500:  # broad sweep: any status other than 500 passes (redirects and 404s included)
+    """Verify that no page returns a 500 Internal Server Error."""
+
+    @pytest.mark.parametrize("path", [  # one generated test case per path
+        "/",
+        "/health",
+        "/health/status",
+        "/health/sovereignty",
+        "/health/components",
+        "/agents/default/panel",
+        "/agents/default/history",
+        "/briefing",
+        "/thinking",
+        "/thinking/api",
+        "/tools",
+        "/tools/api/stats",
+        "/memory",
+        "/calm",
+        "/tasks",
+        "/tasks/pending",
+        "/tasks/active",
+        "/tasks/completed",
+        "/work-orders/queue",
+        "/work-orders/queue/pending",
+        "/work-orders/queue/active",
+        "/mobile",
+        "/mobile/status",
+        "/spark",
+        "/models",
+        "/swarm/live",
+        "/swarm/events",
+        "/marketplace",
+        "/api/queue/status",
+        "/api/tasks",
+        "/api/chat/history",
+        "/api/notifications",
+        "/router/api/providers",
+        "/discord/status",
+        "/telegram/status",
+        "/grok/status",
+        "/grok/stats",
+        "/api/paperclip/status",
+    ])
+    def test_no_500(self, client, path):
+        r = client.get(path)
+        assert r.status_code != 500, f"GET {path} returned 500"  # NOTE(review): only an exact 500 fails; other 5xx codes would pass
--- a/tests/timmy/test_grok_backend.py
+++ b/tests/timmy/test_grok_backend.py
@@ -249,14 +249,14 @@ def test_consult_grok_calls_backend_when_available():
 # ── Grok dashboard route tests ─────────────────────────────────────────────

 def test_grok_status_endpoint(client):
-    """GET /grok/status returns JSON with Grok configuration."""
+    """GET /grok/status returns HTML dashboard page."""
    response = client.get("/grok/status")
    assert response.status_code == 200
-    data = response.json()
-    assert "enabled" in data
-    assert "available" in data
-    assert "model" in data
-    assert "api_key_set" in data
+    assert "text/html" in response.headers.get("content-type", "")
+    # Verify key status info is present in the rendered HTML
+    text = response.text
+    assert "Grok Status" in text
+    assert "Status" in text


 def test_grok_toggle_returns_html(client):