forked from Rockachopa/Timmy-time-dashboard
feat: add functional Ollama chat tests with containerised LLM
Add an ollama service (behind --profile ollama) to the test compose stack and a new test suite that verifies real LLM inference end-to-end: - docker-compose.test.yml: add ollama/ollama service with health check, make OLLAMA_URL and OLLAMA_MODEL configurable via env vars - tests/functional/test_ollama_chat.py: session-scoped fixture that brings up Ollama + dashboard, pulls qwen2.5:0.5b (~400MB, CPU-only), and runs chat/history/multi-turn tests against the live stack - Makefile: add `make test-ollama` target Run with: make test-ollama (or FUNCTIONAL_DOCKER=1 pytest tests/functional/test_ollama_chat.py -v) https://claude.ai/code/session_01NTEzfRHSZQCfkfypxgyHKk
This commit is contained in:
6
Makefile
6
Makefile
@@ -62,6 +62,12 @@ test-cov-html:
|
||||
$(PYTEST) tests/ --cov=src --cov-report=term-missing --cov-report=html -q
|
||||
@echo "✓ HTML coverage report: open htmlcov/index.html"
|
||||
|
||||
# Full-stack functional test: spins up Ollama (CPU, qwen2.5:0.5b) + dashboard
|
||||
# in Docker and verifies real LLM chat end-to-end.
|
||||
# Override model: make test-ollama OLLAMA_TEST_MODEL=tinyllama
|
||||
test-ollama:
|
||||
FUNCTIONAL_DOCKER=1 $(PYTEST) tests/functional/test_ollama_chat.py -v --tb=long -x
|
||||
|
||||
# ── Code quality ──────────────────────────────────────────────────────────────
|
||||
|
||||
lint:
|
||||
|
||||
@@ -3,9 +3,18 @@
|
||||
# Lightweight compose for functional tests. Runs the dashboard on port 18000
|
||||
# and optional agent workers on the swarm-test-net network.
|
||||
#
|
||||
# Profiles:
|
||||
# (default) dashboard only (Ollama on host via host.docker.internal)
|
||||
# ollama adds a containerised Ollama instance + auto model pull
|
||||
# agents adds scalable agent workers
|
||||
#
|
||||
# Usage:
|
||||
# # Swarm tests (no LLM needed):
|
||||
# FUNCTIONAL_DOCKER=1 pytest tests/functional/test_docker_swarm.py -v
|
||||
#
|
||||
# # Full-stack with Ollama (pulls qwen2.5:0.5b automatically):
|
||||
# FUNCTIONAL_DOCKER=1 pytest tests/functional/test_ollama_chat.py -v
|
||||
#
|
||||
# Or manually:
|
||||
# docker compose -f docker-compose.test.yml -p timmy-test up -d --build --wait
|
||||
# curl http://localhost:18000/health
|
||||
@@ -13,6 +22,23 @@
|
||||
|
||||
services:
|
||||
|
||||
# ── Ollama — local LLM for functional tests ───────────────────────────────
|
||||
# Activated with: --profile ollama
|
||||
# Uses a tiny model (qwen2.5:0.5b, ~400 MB) so it runs on CPU-only CI.
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
container_name: timmy-test-ollama
|
||||
profiles:
|
||||
- ollama
|
||||
networks:
|
||||
- swarm-test-net
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 20
|
||||
start_period: 10s
|
||||
|
||||
dashboard:
|
||||
build: .
|
||||
image: timmy-time:test
|
||||
@@ -26,7 +52,8 @@ services:
|
||||
environment:
|
||||
DEBUG: "true"
|
||||
TIMMY_TEST_MODE: "1"
|
||||
OLLAMA_URL: "http://host.docker.internal:11434"
|
||||
OLLAMA_URL: "${OLLAMA_URL:-http://host.docker.internal:11434}"
|
||||
OLLAMA_MODEL: "${OLLAMA_MODEL:-llama3.2}"
|
||||
LIGHTNING_BACKEND: "mock"
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
@@ -49,7 +76,8 @@ services:
|
||||
- ./src:/app/src
|
||||
environment:
|
||||
COORDINATOR_URL: "http://dashboard:8000"
|
||||
OLLAMA_URL: "http://host.docker.internal:11434"
|
||||
OLLAMA_URL: "${OLLAMA_URL:-http://host.docker.internal:11434}"
|
||||
OLLAMA_MODEL: "${OLLAMA_MODEL:-llama3.2}"
|
||||
AGENT_NAME: "${AGENT_NAME:-TestWorker}"
|
||||
AGENT_CAPABILITIES: "${AGENT_CAPABILITIES:-general}"
|
||||
TIMMY_TEST_MODE: "1"
|
||||
|
||||
235
tests/functional/test_ollama_chat.py
Normal file
235
tests/functional/test_ollama_chat.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""End-to-end tests with a real Ollama container.
|
||||
|
||||
These tests spin up the full Docker stack **including** an Ollama container
|
||||
running a tiny model (qwen2.5:0.5b, ~400 MB, CPU-only). They verify that
|
||||
the dashboard can actually generate LLM responses — not just degrade
|
||||
gracefully when Ollama is offline.
|
||||
|
||||
Run with:
|
||||
FUNCTIONAL_DOCKER=1 pytest tests/functional/test_ollama_chat.py -v
|
||||
|
||||
Requirements:
|
||||
- Docker daemon running
|
||||
- ~1 GB free disk (Ollama image + model weights)
|
||||
- No GPU required — qwen2.5:0.5b runs fine on CPU
|
||||
|
||||
The ``ollama_stack`` fixture brings up Ollama + dashboard via docker compose,
|
||||
pulls the model, and tears everything down after the session.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
httpx = pytest.importorskip("httpx")
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
||||
COMPOSE_TEST = PROJECT_ROOT / "docker-compose.test.yml"
|
||||
|
||||
# Tiny model that runs on CPU without GPU. ~400 MB download.
|
||||
TEST_MODEL = os.environ.get("OLLAMA_TEST_MODEL", "qwen2.5:0.5b")
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _compose(*args, timeout=120):
    """Run a ``docker compose`` command against the test compose file.

    Always targets ``docker-compose.test.yml`` under the fixed project name
    ``timmy-test``. Returns the :class:`subprocess.CompletedProcess`; callers
    inspect ``returncode`` / ``stdout`` / ``stderr`` themselves — nothing is
    raised on a non-zero exit.
    """
    base = ["docker", "compose", "-f", str(COMPOSE_TEST), "-p", "timmy-test"]
    return subprocess.run(
        base + list(args),
        capture_output=True,
        text=True,
        timeout=timeout,
        cwd=str(PROJECT_ROOT),
    )
|
||||
|
||||
|
||||
def _wait_for_healthy(url: str, retries=40, interval=3):
    """Poll *url* until it answers HTTP 200.

    Returns True as soon as a 200 arrives, False after *retries* attempts
    spaced *interval* seconds apart (default: 40 × 3 s = 2 minutes).
    """
    for _ in range(retries):
        try:
            if httpx.get(url, timeout=5).status_code == 200:
                return True
        except Exception:
            # Connection refused / reset is expected while the stack boots.
            pass
        time.sleep(interval)
    return False
|
||||
|
||||
|
||||
def _pull_model(model: str, retries=3):
    """Ask the containerised Ollama to pull *model*.

    Retries with linear backoff (5 s, 10 s, ...) because registry pulls are
    occasionally flaky on CI. Returns True once the pull succeeds, False if
    every attempt fails.
    """
    for attempt in range(retries):
        result = subprocess.run(
            [
                "docker", "exec", "timmy-test-ollama",
                "ollama", "pull", model,
            ],
            capture_output=True, text=True, timeout=600,
        )
        if result.returncode == 0:
            return True
        # Fix: don't sleep after the FINAL failed attempt — the original
        # wasted up to 15 s backing off with no retry left to make.
        if attempt < retries - 1:
            time.sleep(5 * (attempt + 1))
    return False
|
||||
|
||||
|
||||
# ── fixtures ─────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def ollama_stack():
    """Bring up Ollama + dashboard, pull the test model, yield base URL.

    Skipped unless ``FUNCTIONAL_DOCKER=1`` is set and Docker is available.
    Yields the dashboard base URL (the only service port-mapped to the host);
    tears the whole compose project down (volumes included) after the session.
    """
    if os.environ.get("FUNCTIONAL_DOCKER") != "1":
        pytest.skip("Set FUNCTIONAL_DOCKER=1 to run Docker tests")
    if not COMPOSE_TEST.exists():
        pytest.skip("docker-compose.test.yml not found")

    # Verify Docker daemon is reachable before attempting anything heavier.
    docker_check = subprocess.run(
        ["docker", "info"], capture_output=True, text=True, timeout=10,
    )
    if docker_check.returncode != 0:
        pytest.skip(f"Docker daemon not available: {docker_check.stderr.strip()}")

    # Bring up Ollama + dashboard with the ollama profile.
    # OLLAMA_URL tells the dashboard to reach the sibling container.
    # (This call needs a custom env, which _compose() doesn't take, so the
    # compose invocation is spelled out here.)
    env = {
        **os.environ,
        "OLLAMA_URL": "http://ollama:11434",
        "OLLAMA_MODEL": TEST_MODEL,
    }
    result = subprocess.run(
        [
            "docker", "compose",
            "-f", str(COMPOSE_TEST),
            "-p", "timmy-test",
            "--profile", "ollama",
            "up", "-d", "--build", "--wait",
        ],
        capture_output=True, text=True, timeout=300,
        cwd=str(PROJECT_ROOT), env=env,
    )
    if result.returncode != 0:
        pytest.fail(f"docker compose up failed:\n{result.stderr}")

    # Fix: this polls the DASHBOARD health endpoint on the host-mapped port
    # (Ollama itself is not port-mapped) — the original comment/name claimed
    # it was waiting for Ollama.
    stack_ready = _wait_for_healthy("http://localhost:18000/health")
    if not stack_ready:
        logs = _compose("logs")
        _compose("--profile", "ollama", "down", "-v")
        pytest.fail(f"Stack never became healthy:\n{logs.stdout}")

    # Pull the tiny test model into the Ollama container.
    if not _pull_model(TEST_MODEL):
        logs = _compose("logs", "ollama")
        _compose("--profile", "ollama", "down", "-v")
        pytest.fail(f"Failed to pull {TEST_MODEL}:\n{logs.stdout}")

    yield "http://localhost:18000"

    # Teardown — reuse the _compose helper instead of duplicating the full
    # docker-compose invocation (same file/project flags, same semantics).
    _compose("--profile", "ollama", "down", "-v", timeout=60)
|
||||
|
||||
|
||||
# ── tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestOllamaHealth:
    """Verify the dashboard can reach the Ollama container."""

    def test_health_reports_ollama_up(self, ollama_stack):
        """GET /health should report ollama as 'up'."""
        response = httpx.get(f"{ollama_stack}/health", timeout=10)
        assert response.status_code == 200

        payload = response.json()
        services = payload.get("services", {})
        assert services.get("ollama") == "up", (
            f"Expected ollama=up, got: {services}"
        )
|
||||
|
||||
|
||||
class TestOllamaChat:
    """Send a real prompt through the dashboard and get an LLM response."""

    @staticmethod
    def _send(base_url, message):
        """POST one chat message to the timmy agent.

        The generous timeout matters: first inference can be slow on CPU.
        """
        return httpx.post(
            f"{base_url}/agents/timmy/chat",
            data={"message": message},
            timeout=120,
        )

    def test_chat_returns_llm_response(self, ollama_stack):
        """POST /agents/timmy/chat with a real message → non-error HTML."""
        resp = self._send(ollama_stack, "Say hello in exactly three words.")
        assert resp.status_code == 200

        lowered = resp.text.lower()
        # The response should contain actual content, not an error fallback.
        assert "error" not in lowered or "hello" in lowered, (
            f"Expected LLM response, got error:\n{resp.text[:500]}"
        )

    def test_chat_history_contains_response(self, ollama_stack):
        """After chatting, history should include both user and agent messages."""
        self._send(ollama_stack, "What is 2+2?")

        history = httpx.get(f"{ollama_stack}/agents/timmy/history", timeout=10)
        assert history.status_code == 200

        lowered = history.text.lower()
        assert "2+2" in lowered or "2 + 2" in lowered

    def test_multiple_turns(self, ollama_stack):
        """Verify the agent handles a second turn without crashing."""
        first = self._send(ollama_stack, "Remember the word: pineapple")
        assert first.status_code == 200

        second = self._send(
            ollama_stack, "What word did I just ask you to remember?"
        )
        assert second.status_code == 200
        # We don't assert "pineapple" — tiny models have weak memory.
        # The point is it doesn't crash on multi-turn.
|
||||
|
||||
|
||||
class TestOllamaDirectAPI:
    """Hit the Ollama container directly to verify the model is loaded."""

    def test_ollama_api_tags(self, ollama_stack):
        """Ollama /api/tags should list the pulled test model."""
        # Ollama isn't port-mapped, so we exec into the container.
        proc = subprocess.run(
            [
                "docker", "exec", "timmy-test-ollama",
                "curl", "-sf", "http://localhost:11434/api/tags",
            ],
            capture_output=True, text=True, timeout=10,
        )
        assert proc.returncode == 0

        # Match on the model family (e.g. "qwen2.5" from "qwen2.5:0.5b").
        family = TEST_MODEL.split(":")[0]
        assert family in proc.stdout
|
||||
Reference in New Issue
Block a user