From 78cf91697cad273a3012a06e7a91420eefa62279 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 25 Feb 2026 02:44:36 +0000 Subject: [PATCH] feat: add functional Ollama chat tests with containerised LLM Add an ollama service (behind --profile ollama) to the test compose stack and a new test suite that verifies real LLM inference end-to-end: - docker-compose.test.yml: add ollama/ollama service with health check, make OLLAMA_URL and OLLAMA_MODEL configurable via env vars - tests/functional/test_ollama_chat.py: session-scoped fixture that brings up Ollama + dashboard, pulls qwen2.5:0.5b (~400MB, CPU-only), and runs chat/history/multi-turn tests against the live stack - Makefile: add `make test-ollama` target Run with: make test-ollama (or FUNCTIONAL_DOCKER=1 pytest tests/functional/test_ollama_chat.py -v) https://claude.ai/code/session_01NTEzfRHSZQCfkfypxgyHKk --- Makefile | 6 + docker-compose.test.yml | 32 +++- tests/functional/test_ollama_chat.py | 235 +++++++++++++++++++++++++++ 3 files changed, 271 insertions(+), 2 deletions(-) create mode 100644 tests/functional/test_ollama_chat.py diff --git a/Makefile b/Makefile index 12ec20d2..744bfaea 100644 --- a/Makefile +++ b/Makefile @@ -62,6 +62,12 @@ test-cov-html: $(PYTEST) tests/ --cov=src --cov-report=term-missing --cov-report=html -q @echo "✓ HTML coverage report: open htmlcov/index.html" +# Full-stack functional test: spins up Ollama (CPU, qwen2.5:0.5b) + dashboard +# in Docker and verifies real LLM chat end-to-end. +# Override model: make test-ollama OLLAMA_TEST_MODEL=tinyllama +test-ollama: + FUNCTIONAL_DOCKER=1 $(PYTEST) tests/functional/test_ollama_chat.py -v --tb=long -x + # ── Code quality ────────────────────────────────────────────────────────────── lint: diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 517dbdb2..6e58264a 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -3,9 +3,18 @@ # Lightweight compose for functional tests. 
# Runs the dashboard on port 18000
# and optional agent workers on the swarm-test-net network.
#
+# Profiles:
+#   (default)  dashboard only (Ollama on host via host.docker.internal)
+#   ollama     adds a containerised Ollama instance + auto model pull
+#   agents     adds scalable agent workers
+#
 # Usage:
+#   # Swarm tests (no LLM needed):
 #   FUNCTIONAL_DOCKER=1 pytest tests/functional/test_docker_swarm.py -v
 #
+#   # Full-stack with Ollama (pulls qwen2.5:0.5b automatically):
+#   FUNCTIONAL_DOCKER=1 pytest tests/functional/test_ollama_chat.py -v
+#
 # Or manually:
 #   docker compose -f docker-compose.test.yml -p timmy-test up -d --build --wait
 #   curl http://localhost:18000/health
@@ -13,6 +22,23 @@

 services:
+  # ── Ollama — local LLM for functional tests ───────────────────────────────
+  # Activated with: --profile ollama
+  # Uses a tiny model (qwen2.5:0.5b, ~400 MB) so it runs on CPU-only CI.
+  ollama:
+    image: ollama/ollama:latest
+    container_name: timmy-test-ollama
+    profiles:
+      - ollama
+    networks:
+      - swarm-test-net
+    healthcheck:
+      test: ["CMD", "ollama", "ls"]
+      interval: 5s
+      timeout: 5s
+      retries: 20
+      start_period: 10s
+
 dashboard:
   build: .
image: timmy-time:test @@ -26,7 +52,8 @@ services: environment: DEBUG: "true" TIMMY_TEST_MODE: "1" - OLLAMA_URL: "http://host.docker.internal:11434" + OLLAMA_URL: "${OLLAMA_URL:-http://host.docker.internal:11434}" + OLLAMA_MODEL: "${OLLAMA_MODEL:-llama3.2}" LIGHTNING_BACKEND: "mock" extra_hosts: - "host.docker.internal:host-gateway" @@ -49,7 +76,8 @@ services: - ./src:/app/src environment: COORDINATOR_URL: "http://dashboard:8000" - OLLAMA_URL: "http://host.docker.internal:11434" + OLLAMA_URL: "${OLLAMA_URL:-http://host.docker.internal:11434}" + OLLAMA_MODEL: "${OLLAMA_MODEL:-llama3.2}" AGENT_NAME: "${AGENT_NAME:-TestWorker}" AGENT_CAPABILITIES: "${AGENT_CAPABILITIES:-general}" TIMMY_TEST_MODE: "1" diff --git a/tests/functional/test_ollama_chat.py b/tests/functional/test_ollama_chat.py new file mode 100644 index 00000000..03990856 --- /dev/null +++ b/tests/functional/test_ollama_chat.py @@ -0,0 +1,235 @@ +"""End-to-end tests with a real Ollama container. + +These tests spin up the full Docker stack **including** an Ollama container +running a tiny model (qwen2.5:0.5b, ~400 MB, CPU-only). They verify that +the dashboard can actually generate LLM responses — not just degrade +gracefully when Ollama is offline. + +Run with: + FUNCTIONAL_DOCKER=1 pytest tests/functional/test_ollama_chat.py -v + +Requirements: + - Docker daemon running + - ~1 GB free disk (Ollama image + model weights) + - No GPU required — qwen2.5:0.5b runs fine on CPU + +The ``ollama_stack`` fixture brings up Ollama + dashboard via docker compose, +pulls the model, and tears everything down after the session. +""" + +import os +import subprocess +import time +from pathlib import Path + +import pytest + +httpx = pytest.importorskip("httpx") + +PROJECT_ROOT = Path(__file__).parent.parent.parent +COMPOSE_TEST = PROJECT_ROOT / "docker-compose.test.yml" + +# Tiny model that runs on CPU without GPU. ~400 MB download. 
+TEST_MODEL = os.environ.get("OLLAMA_TEST_MODEL", "qwen2.5:0.5b") + +# ── helpers ────────────────────────────────────────────────────────────────── + + +def _compose(*args, timeout=120): + """Run a docker compose command against the test compose file.""" + cmd = [ + "docker", "compose", + "-f", str(COMPOSE_TEST), + "-p", "timmy-test", + *args, + ] + return subprocess.run( + cmd, capture_output=True, text=True, + timeout=timeout, cwd=str(PROJECT_ROOT), + ) + + +def _wait_for_healthy(url: str, retries=40, interval=3): + """Poll *url* until it returns HTTP 200.""" + for _ in range(retries): + try: + r = httpx.get(url, timeout=5) + if r.status_code == 200: + return True + except Exception: + pass + time.sleep(interval) + return False + + +def _pull_model(model: str, retries=3): + """Ask the containerised Ollama to pull *model*.""" + for attempt in range(retries): + result = subprocess.run( + [ + "docker", "exec", "timmy-test-ollama", + "ollama", "pull", model, + ], + capture_output=True, text=True, timeout=600, + ) + if result.returncode == 0: + return True + time.sleep(5 * (attempt + 1)) + return False + + +# ── fixtures ───────────────────────────────────────────────────────────────── + + +@pytest.fixture(scope="session") +def ollama_stack(): + """Bring up Ollama + dashboard, pull the test model, yield base URL. + + Skipped unless ``FUNCTIONAL_DOCKER=1`` is set and Docker is available. + """ + if os.environ.get("FUNCTIONAL_DOCKER") != "1": + pytest.skip("Set FUNCTIONAL_DOCKER=1 to run Docker tests") + if not COMPOSE_TEST.exists(): + pytest.skip("docker-compose.test.yml not found") + + # Verify Docker daemon + docker_check = subprocess.run( + ["docker", "info"], capture_output=True, text=True, timeout=10, + ) + if docker_check.returncode != 0: + pytest.skip(f"Docker daemon not available: {docker_check.stderr.strip()}") + + # Bring up Ollama + dashboard with the ollama profile. + # OLLAMA_URL tells the dashboard to reach the sibling container. 
+ env = { + **os.environ, + "OLLAMA_URL": "http://ollama:11434", + "OLLAMA_MODEL": TEST_MODEL, + } + result = subprocess.run( + [ + "docker", "compose", + "-f", str(COMPOSE_TEST), + "-p", "timmy-test", + "--profile", "ollama", + "up", "-d", "--build", "--wait", + ], + capture_output=True, text=True, timeout=300, + cwd=str(PROJECT_ROOT), env=env, + ) + if result.returncode != 0: + pytest.fail(f"docker compose up failed:\n{result.stderr}") + + # Wait for Ollama to be ready + ollama_ready = _wait_for_healthy("http://localhost:18000/health") + if not ollama_ready: + logs = _compose("logs") + _compose("--profile", "ollama", "down", "-v") + pytest.fail(f"Stack never became healthy:\n{logs.stdout}") + + # Pull the tiny test model into the Ollama container + if not _pull_model(TEST_MODEL): + logs = _compose("logs", "ollama") + _compose("--profile", "ollama", "down", "-v") + pytest.fail(f"Failed to pull {TEST_MODEL}:\n{logs.stdout}") + + yield "http://localhost:18000" + + # Teardown + subprocess.run( + [ + "docker", "compose", + "-f", str(COMPOSE_TEST), + "-p", "timmy-test", + "--profile", "ollama", + "down", "-v", + ], + capture_output=True, text=True, timeout=60, + cwd=str(PROJECT_ROOT), + ) + + +# ── tests ──────────────────────────────────────────────────────────────────── + + +class TestOllamaHealth: + """Verify the dashboard can reach the Ollama container.""" + + def test_health_reports_ollama_up(self, ollama_stack): + """GET /health should report ollama as 'up'.""" + resp = httpx.get(f"{ollama_stack}/health", timeout=10) + assert resp.status_code == 200 + data = resp.json() + services = data.get("services", {}) + assert services.get("ollama") == "up", ( + f"Expected ollama=up, got: {services}" + ) + + +class TestOllamaChat: + """Send a real prompt through the dashboard and get an LLM response.""" + + def test_chat_returns_llm_response(self, ollama_stack): + """POST /agents/timmy/chat with a real message → non-error HTML.""" + resp = httpx.post( + 
f"{ollama_stack}/agents/timmy/chat", + data={"message": "Say hello in exactly three words."}, + timeout=120, # first inference can be slow on CPU + ) + assert resp.status_code == 200 + body = resp.text.lower() + # The response should contain actual content, not an error fallback + assert "error" not in body or "hello" in body, ( + f"Expected LLM response, got error:\n{resp.text[:500]}" + ) + + def test_chat_history_contains_response(self, ollama_stack): + """After chatting, history should include both user and agent messages.""" + # Send a message + httpx.post( + f"{ollama_stack}/agents/timmy/chat", + data={"message": "What is 2+2?"}, + timeout=120, + ) + # Fetch history + hist = httpx.get(f"{ollama_stack}/agents/timmy/history", timeout=10) + assert hist.status_code == 200 + body = hist.text.lower() + assert "2+2" in body or "2 + 2" in body + + def test_multiple_turns(self, ollama_stack): + """Verify the agent handles a second turn without crashing.""" + # First turn + r1 = httpx.post( + f"{ollama_stack}/agents/timmy/chat", + data={"message": "Remember the word: pineapple"}, + timeout=120, + ) + assert r1.status_code == 200 + + # Second turn + r2 = httpx.post( + f"{ollama_stack}/agents/timmy/chat", + data={"message": "What word did I just ask you to remember?"}, + timeout=120, + ) + assert r2.status_code == 200 + # We don't assert "pineapple" — tiny models have weak memory. + # The point is it doesn't crash on multi-turn. + + +class TestOllamaDirectAPI: + """Hit the Ollama container directly to verify the model is loaded.""" + + def test_ollama_api_tags(self, ollama_stack): + """Ollama /api/tags should list the pulled test model.""" + # Ollama isn't port-mapped, so we exec into the container + result = subprocess.run( + [ + "docker", "exec", "timmy-test-ollama", + "curl", "-sf", "http://localhost:11434/api/tags", + ], + capture_output=True, text=True, timeout=10, + ) + assert result.returncode == 0 + assert TEST_MODEL.split(":")[0] in result.stdout