forked from Rockachopa/Timmy-time-dashboard
feat: add functional Ollama chat tests with containerised LLM
Add an ollama service (behind --profile ollama) to the test compose stack and a new test suite that verifies real LLM inference end-to-end: - docker-compose.test.yml: add ollama/ollama service with health check, make OLLAMA_URL and OLLAMA_MODEL configurable via env vars - tests/functional/test_ollama_chat.py: session-scoped fixture that brings up Ollama + dashboard, pulls qwen2.5:0.5b (~400MB, CPU-only), and runs chat/history/multi-turn tests against the live stack - Makefile: add `make test-ollama` target Run with: make test-ollama (or FUNCTIONAL_DOCKER=1 pytest tests/functional/test_ollama_chat.py -v) https://claude.ai/code/session_01NTEzfRHSZQCfkfypxgyHKk
This commit is contained in:
6
Makefile
6
Makefile
@@ -62,6 +62,12 @@ test-cov-html:
|
||||
$(PYTEST) tests/ --cov=src --cov-report=term-missing --cov-report=html -q
|
||||
@echo "✓ HTML coverage report: open htmlcov/index.html"
|
||||
|
||||
# Full-stack functional test: spins up Ollama (CPU, qwen2.5:0.5b) + dashboard
|
||||
# in Docker and verifies real LLM chat end-to-end.
|
||||
# Override model: make test-ollama OLLAMA_TEST_MODEL=tinyllama
|
||||
test-ollama:
|
||||
FUNCTIONAL_DOCKER=1 $(PYTEST) tests/functional/test_ollama_chat.py -v --tb=long -x
|
||||
|
||||
# ── Code quality ──────────────────────────────────────────────────────────────
|
||||
|
||||
lint:
|
||||
|
||||
@@ -3,9 +3,18 @@
|
||||
# Lightweight compose for functional tests. Runs the dashboard on port 18000
|
||||
# and optional agent workers on the swarm-test-net network.
|
||||
#
|
||||
# Profiles:
|
||||
# (default) dashboard only (Ollama on host via host.docker.internal)
|
||||
# ollama adds a containerised Ollama instance + auto model pull
|
||||
# agents adds scalable agent workers
|
||||
#
|
||||
# Usage:
|
||||
# # Swarm tests (no LLM needed):
|
||||
# FUNCTIONAL_DOCKER=1 pytest tests/functional/test_docker_swarm.py -v
|
||||
#
|
||||
# # Full-stack with Ollama (pulls qwen2.5:0.5b automatically):
|
||||
# FUNCTIONAL_DOCKER=1 pytest tests/functional/test_ollama_chat.py -v
|
||||
#
|
||||
# Or manually:
|
||||
# docker compose -f docker-compose.test.yml -p timmy-test up -d --build --wait
|
||||
# curl http://localhost:18000/health
|
||||
@@ -13,6 +22,23 @@
|
||||
|
||||
services:
|
||||
|
||||
# ── Ollama — local LLM for functional tests ───────────────────────────────
|
||||
# Activated with: --profile ollama
|
||||
# Uses a tiny model (qwen2.5:0.5b, ~400 MB) so it runs on CPU-only CI.
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
container_name: timmy-test-ollama
|
||||
profiles:
|
||||
- ollama
|
||||
networks:
|
||||
- swarm-test-net
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 20
|
||||
start_period: 10s
|
||||
|
||||
dashboard:
|
||||
build: .
|
||||
image: timmy-time:test
|
||||
@@ -26,7 +52,8 @@ services:
|
||||
environment:
|
||||
DEBUG: "true"
|
||||
TIMMY_TEST_MODE: "1"
|
||||
OLLAMA_URL: "http://host.docker.internal:11434"
|
||||
OLLAMA_URL: "${OLLAMA_URL:-http://host.docker.internal:11434}"
|
||||
OLLAMA_MODEL: "${OLLAMA_MODEL:-llama3.2}"
|
||||
LIGHTNING_BACKEND: "mock"
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
@@ -49,7 +76,8 @@ services:
|
||||
- ./src:/app/src
|
||||
environment:
|
||||
COORDINATOR_URL: "http://dashboard:8000"
|
||||
OLLAMA_URL: "http://host.docker.internal:11434"
|
||||
OLLAMA_URL: "${OLLAMA_URL:-http://host.docker.internal:11434}"
|
||||
OLLAMA_MODEL: "${OLLAMA_MODEL:-llama3.2}"
|
||||
AGENT_NAME: "${AGENT_NAME:-TestWorker}"
|
||||
AGENT_CAPABILITIES: "${AGENT_CAPABILITIES:-general}"
|
||||
TIMMY_TEST_MODE: "1"
|
||||
|
||||
235
tests/functional/test_ollama_chat.py
Normal file
235
tests/functional/test_ollama_chat.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""End-to-end tests with a real Ollama container.
|
||||
|
||||
These tests spin up the full Docker stack **including** an Ollama container
|
||||
running a tiny model (qwen2.5:0.5b, ~400 MB, CPU-only). They verify that
|
||||
the dashboard can actually generate LLM responses — not just degrade
|
||||
gracefully when Ollama is offline.
|
||||
|
||||
Run with:
|
||||
FUNCTIONAL_DOCKER=1 pytest tests/functional/test_ollama_chat.py -v
|
||||
|
||||
Requirements:
|
||||
- Docker daemon running
|
||||
- ~1 GB free disk (Ollama image + model weights)
|
||||
- No GPU required — qwen2.5:0.5b runs fine on CPU
|
||||
|
||||
The ``ollama_stack`` fixture brings up Ollama + dashboard via docker compose,
|
||||
pulls the model, and tears everything down after the session.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
httpx = pytest.importorskip("httpx")
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
||||
COMPOSE_TEST = PROJECT_ROOT / "docker-compose.test.yml"
|
||||
|
||||
# Tiny model that runs on CPU without GPU. ~400 MB download.
|
||||
TEST_MODEL = os.environ.get("OLLAMA_TEST_MODEL", "qwen2.5:0.5b")
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _compose(*args, timeout=120):
    """Run a ``docker compose`` command against the test compose file.

    Always targets ``docker-compose.test.yml`` under the fixed project name
    ``timmy-test``. Returns the :class:`subprocess.CompletedProcess`; callers
    inspect ``returncode`` / ``stdout`` / ``stderr`` themselves — nothing is
    raised on a non-zero exit.
    """
    base = ["docker", "compose", "-f", str(COMPOSE_TEST), "-p", "timmy-test"]
    return subprocess.run(
        base + list(args),
        capture_output=True,
        text=True,
        timeout=timeout,
        cwd=str(PROJECT_ROOT),
    )
|
||||
|
||||
|
||||
def _wait_for_healthy(url: str, retries=40, interval=3):
    """Poll *url* until it answers HTTP 200.

    Returns True as soon as a 200 arrives, False after *retries* attempts
    spaced *interval* seconds apart (default: 40 × 3 s = 2 minutes).
    """
    for _ in range(retries):
        try:
            if httpx.get(url, timeout=5).status_code == 200:
                return True
        except Exception:
            # Connection refused / reset is expected while the stack boots.
            pass
        time.sleep(interval)
    return False
|
||||
|
||||
|
||||
def _pull_model(model: str, retries=3):
    """Ask the containerised Ollama to pull *model*.

    Retries with linear backoff (5 s, 10 s, ...) because registry pulls are
    occasionally flaky on CI. Returns True once the pull succeeds, False if
    every attempt fails.
    """
    for attempt in range(retries):
        result = subprocess.run(
            [
                "docker", "exec", "timmy-test-ollama",
                "ollama", "pull", model,
            ],
            capture_output=True, text=True, timeout=600,
        )
        if result.returncode == 0:
            return True
        # Fix: don't sleep after the FINAL failed attempt — the original
        # wasted up to 15 s backing off with no retry left to make.
        if attempt < retries - 1:
            time.sleep(5 * (attempt + 1))
    return False
|
||||
|
||||
|
||||
# ── fixtures ─────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def ollama_stack():
    """Bring up Ollama + dashboard, pull the test model, yield base URL.

    Skipped unless ``FUNCTIONAL_DOCKER=1`` is set and Docker is available.
    Yields the dashboard base URL (the only service port-mapped to the host);
    tears the whole compose project down (volumes included) after the session.
    """
    if os.environ.get("FUNCTIONAL_DOCKER") != "1":
        pytest.skip("Set FUNCTIONAL_DOCKER=1 to run Docker tests")
    if not COMPOSE_TEST.exists():
        pytest.skip("docker-compose.test.yml not found")

    # Verify Docker daemon is reachable before attempting anything heavier.
    docker_check = subprocess.run(
        ["docker", "info"], capture_output=True, text=True, timeout=10,
    )
    if docker_check.returncode != 0:
        pytest.skip(f"Docker daemon not available: {docker_check.stderr.strip()}")

    # Bring up Ollama + dashboard with the ollama profile.
    # OLLAMA_URL tells the dashboard to reach the sibling container.
    # (This call needs a custom env, which _compose() doesn't take, so the
    # compose invocation is spelled out here.)
    env = {
        **os.environ,
        "OLLAMA_URL": "http://ollama:11434",
        "OLLAMA_MODEL": TEST_MODEL,
    }
    result = subprocess.run(
        [
            "docker", "compose",
            "-f", str(COMPOSE_TEST),
            "-p", "timmy-test",
            "--profile", "ollama",
            "up", "-d", "--build", "--wait",
        ],
        capture_output=True, text=True, timeout=300,
        cwd=str(PROJECT_ROOT), env=env,
    )
    if result.returncode != 0:
        pytest.fail(f"docker compose up failed:\n{result.stderr}")

    # Fix: this polls the DASHBOARD health endpoint on the host-mapped port
    # (Ollama itself is not port-mapped) — the original comment/name claimed
    # it was waiting for Ollama.
    stack_ready = _wait_for_healthy("http://localhost:18000/health")
    if not stack_ready:
        logs = _compose("logs")
        _compose("--profile", "ollama", "down", "-v")
        pytest.fail(f"Stack never became healthy:\n{logs.stdout}")

    # Pull the tiny test model into the Ollama container.
    if not _pull_model(TEST_MODEL):
        logs = _compose("logs", "ollama")
        _compose("--profile", "ollama", "down", "-v")
        pytest.fail(f"Failed to pull {TEST_MODEL}:\n{logs.stdout}")

    yield "http://localhost:18000"

    # Teardown — reuse the _compose helper instead of duplicating the full
    # docker-compose invocation (same file/project flags, same semantics).
    _compose("--profile", "ollama", "down", "-v", timeout=60)
|
||||
|
||||
|
||||
# ── tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestOllamaHealth:
    """Verify the dashboard can reach the Ollama container."""

    def test_health_reports_ollama_up(self, ollama_stack):
        """GET /health should report ollama as 'up'."""
        response = httpx.get(f"{ollama_stack}/health", timeout=10)
        assert response.status_code == 200

        payload = response.json()
        services = payload.get("services", {})
        assert services.get("ollama") == "up", (
            f"Expected ollama=up, got: {services}"
        )
|
||||
|
||||
|
||||
class TestOllamaChat:
    """Send a real prompt through the dashboard and get an LLM response."""

    @staticmethod
    def _send(base_url, message):
        """POST one chat message to the timmy agent.

        The generous timeout matters: first inference can be slow on CPU.
        """
        return httpx.post(
            f"{base_url}/agents/timmy/chat",
            data={"message": message},
            timeout=120,
        )

    def test_chat_returns_llm_response(self, ollama_stack):
        """POST /agents/timmy/chat with a real message → non-error HTML."""
        resp = self._send(ollama_stack, "Say hello in exactly three words.")
        assert resp.status_code == 200

        lowered = resp.text.lower()
        # The response should contain actual content, not an error fallback.
        assert "error" not in lowered or "hello" in lowered, (
            f"Expected LLM response, got error:\n{resp.text[:500]}"
        )

    def test_chat_history_contains_response(self, ollama_stack):
        """After chatting, history should include both user and agent messages."""
        self._send(ollama_stack, "What is 2+2?")

        history = httpx.get(f"{ollama_stack}/agents/timmy/history", timeout=10)
        assert history.status_code == 200

        lowered = history.text.lower()
        assert "2+2" in lowered or "2 + 2" in lowered

    def test_multiple_turns(self, ollama_stack):
        """Verify the agent handles a second turn without crashing."""
        first = self._send(ollama_stack, "Remember the word: pineapple")
        assert first.status_code == 200

        second = self._send(
            ollama_stack, "What word did I just ask you to remember?"
        )
        assert second.status_code == 200
        # We don't assert "pineapple" — tiny models have weak memory.
        # The point is it doesn't crash on multi-turn.
|
||||
|
||||
|
||||
class TestOllamaDirectAPI:
    """Hit the Ollama container directly to verify the model is loaded."""

    def test_ollama_api_tags(self, ollama_stack):
        """Ollama /api/tags should list the pulled test model."""
        # Ollama isn't port-mapped, so we exec into the container.
        proc = subprocess.run(
            [
                "docker", "exec", "timmy-test-ollama",
                "curl", "-sf", "http://localhost:11434/api/tags",
            ],
            capture_output=True, text=True, timeout=10,
        )
        assert proc.returncode == 0

        # Match on the model family (e.g. "qwen2.5" from "qwen2.5:0.5b").
        family = TEST_MODEL.split(":")[0]
        assert family in proc.stdout
|
||||
Reference in New Issue
Block a user