forked from Rockachopa/Timmy-time-dashboard
Compare commits
1 Commits
claude/iss
...
claude/iss
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
28d1905df4 |
@@ -150,6 +150,7 @@ async def transcribe_audio(audio: bytes) -> str:
|
|||||||
| Service | When Unavailable | Fallback Behavior |
|
| Service | When Unavailable | Fallback Behavior |
|
||||||
|---------|------------------|-------------------|
|
|---------|------------------|-------------------|
|
||||||
| Ollama | No local LLM | Claude backend (if ANTHROPIC_API_KEY set) |
|
| Ollama | No local LLM | Claude backend (if ANTHROPIC_API_KEY set) |
|
||||||
|
| vLLM | Server not running | Ollama backend (cascade router fallback) |
|
||||||
| Redis | Cache/storage down | In-memory dict (ephemeral) |
|
| Redis | Cache/storage down | In-memory dict (ephemeral) |
|
||||||
| AirLLM | Import error or no Apple Silicon | Ollama backend |
|
| AirLLM | Import error or no Apple Silicon | Ollama backend |
|
||||||
| Voice (Piper) | Service down | Browser Web Speech API |
|
| Voice (Piper) | Service down | Browser Web Speech API |
|
||||||
|
|||||||
@@ -131,11 +131,34 @@ providers:
|
|||||||
context_window: 32000
|
context_window: 32000
|
||||||
capabilities: [text, tools, json, streaming]
|
capabilities: [text, tools, json, streaming]
|
||||||
|
|
||||||
# Tertiary: OpenAI (if API key available)
|
# Tertiary: vLLM (OpenAI-compatible, continuous batching, 3-10x agentic throughput)
|
||||||
|
# Runs on CUDA GPU or CPU. On Apple Silicon, prefer vllm-mlx-local (above).
|
||||||
|
# To enable: start vLLM server:
|
||||||
|
# python -m vllm.entrypoints.openai.api_server \
|
||||||
|
# --model Qwen/Qwen2.5-14B-Instruct --port 8001
|
||||||
|
# Then set enabled: true (or TIMMY_LLM_BACKEND=vllm + VLLM_URL=http://localhost:8001)
|
||||||
|
- name: vllm-local
|
||||||
|
type: vllm
|
||||||
|
enabled: false # Enable when vLLM server is running
|
||||||
|
priority: 3
|
||||||
|
tier: local
|
||||||
|
base_url: "http://localhost:8001/v1"
|
||||||
|
models:
|
||||||
|
- name: Qwen/Qwen2.5-14B-Instruct
|
||||||
|
default: true
|
||||||
|
context_window: 32000
|
||||||
|
capabilities: [text, tools, json, streaming, complex]
|
||||||
|
description: "Qwen2.5-14B on vLLM — continuous batching for agentic workloads"
|
||||||
|
- name: Qwen/Qwen2.5-7B-Instruct
|
||||||
|
context_window: 32000
|
||||||
|
capabilities: [text, tools, json, streaming, routine]
|
||||||
|
description: "Qwen2.5-7B on vLLM — fast model for routine tasks"
|
||||||
|
|
||||||
|
# Quaternary (priority 4): OpenAI (if API key available)
|
||||||
- name: openai-backup
|
- name: openai-backup
|
||||||
type: openai
|
type: openai
|
||||||
enabled: false # Enable by setting OPENAI_API_KEY
|
enabled: false # Enable by setting OPENAI_API_KEY
|
||||||
priority: 3
|
priority: 4
|
||||||
tier: standard_cloud
|
tier: standard_cloud
|
||||||
api_key: "${OPENAI_API_KEY}" # Loaded from environment
|
api_key: "${OPENAI_API_KEY}" # Loaded from environment
|
||||||
base_url: null # Use default OpenAI endpoint
|
base_url: null # Use default OpenAI endpoint
|
||||||
@@ -147,12 +170,12 @@ providers:
|
|||||||
- name: gpt-4o
|
- name: gpt-4o
|
||||||
context_window: 128000
|
context_window: 128000
|
||||||
capabilities: [text, vision, tools, json, streaming]
|
capabilities: [text, vision, tools, json, streaming]
|
||||||
|
|
||||||
# Quaternary: Anthropic (if API key available)
|
# Quinary (priority 5): Anthropic (if API key available)
|
||||||
- name: anthropic-backup
|
- name: anthropic-backup
|
||||||
type: anthropic
|
type: anthropic
|
||||||
enabled: false # Enable by setting ANTHROPIC_API_KEY
|
enabled: false # Enable by setting ANTHROPIC_API_KEY
|
||||||
priority: 4
|
priority: 5
|
||||||
tier: frontier
|
tier: frontier
|
||||||
api_key: "${ANTHROPIC_API_KEY}"
|
api_key: "${ANTHROPIC_API_KEY}"
|
||||||
models:
|
models:
|
||||||
|
|||||||
@@ -42,6 +42,10 @@ services:
|
|||||||
GROK_ENABLED: "${GROK_ENABLED:-false}"
|
GROK_ENABLED: "${GROK_ENABLED:-false}"
|
||||||
XAI_API_KEY: "${XAI_API_KEY:-}"
|
XAI_API_KEY: "${XAI_API_KEY:-}"
|
||||||
GROK_DEFAULT_MODEL: "${GROK_DEFAULT_MODEL:-grok-3-fast}"
|
GROK_DEFAULT_MODEL: "${GROK_DEFAULT_MODEL:-grok-3-fast}"
|
||||||
|
# vLLM backend — set TIMMY_LLM_BACKEND=vllm to activate
|
||||||
|
TIMMY_LLM_BACKEND: "${TIMMY_LLM_BACKEND:-ollama}"
|
||||||
|
VLLM_URL: "${VLLM_URL:-http://localhost:8001}"
|
||||||
|
VLLM_MODEL: "${VLLM_MODEL:-Qwen/Qwen2.5-14B-Instruct}"
|
||||||
extra_hosts:
|
extra_hosts:
|
||||||
- "host.docker.internal:host-gateway" # Linux: maps to host IP
|
- "host.docker.internal:host-gateway" # Linux: maps to host IP
|
||||||
networks:
|
networks:
|
||||||
@@ -74,6 +78,49 @@ services:
|
|||||||
profiles:
|
profiles:
|
||||||
- celery
|
- celery
|
||||||
|
|
||||||
|
# ── vLLM — high-throughput inference server (GPU optional) ──────────────
|
||||||
|
# Requires the 'vllm' profile: docker compose --profile vllm up
|
||||||
|
#
|
||||||
|
# GPU (NVIDIA): set VLLM_MODEL and ensure nvidia-container-toolkit is installed.
|
||||||
|
# CPU-only: add --device cpu to VLLM_EXTRA_ARGS (slower, but works anywhere).
|
||||||
|
#
|
||||||
|
# The dashboard reaches vLLM at http://vllm:8001 (inside timmy-net).
|
||||||
|
# Set VLLM_URL=http://vllm:8001 in the dashboard environment when using this service.
|
||||||
|
vllm:
|
||||||
|
image: vllm/vllm-openai:latest
|
||||||
|
container_name: timmy-vllm
|
||||||
|
profiles:
|
||||||
|
- vllm
|
||||||
|
ports:
|
||||||
|
- "8001:8001"
|
||||||
|
environment:
|
||||||
|
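# Model to load — override with the VLLM_MODEL env var.
# NOTE: the dashboard service defaults VLLM_MODEL to Qwen/Qwen2.5-14B-Instruct while this
# service defaults to Qwen/Qwen2.5-7B-Instruct; set VLLM_MODEL explicitly so both agree.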
|
||||||
|
VLLM_MODEL: "${VLLM_MODEL:-Qwen/Qwen2.5-7B-Instruct}"
|
||||||
|
command: >
|
||||||
|
--model ${VLLM_MODEL:-Qwen/Qwen2.5-7B-Instruct}
|
||||||
|
--port 8001
|
||||||
|
--host 0.0.0.0
|
||||||
|
${VLLM_EXTRA_ARGS:-}
|
||||||
|
volumes:
|
||||||
|
- vllm-cache:/root/.cache/huggingface
|
||||||
|
networks:
|
||||||
|
- timmy-net
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 5
|
||||||
|
start_period: 120s
|
||||||
|
# GPU support — uncomment to enable NVIDIA GPU passthrough
|
||||||
|
# deploy:
|
||||||
|
# resources:
|
||||||
|
# reservations:
|
||||||
|
# devices:
|
||||||
|
# - driver: nvidia
|
||||||
|
# count: all
|
||||||
|
# capabilities: [gpu]
|
||||||
|
|
||||||
# ── OpenFang — vendored agent runtime sidecar ────────────────────────────
|
# ── OpenFang — vendored agent runtime sidecar ────────────────────────────
|
||||||
openfang:
|
openfang:
|
||||||
build:
|
build:
|
||||||
@@ -110,6 +157,8 @@ volumes:
|
|||||||
device: "${PWD}/data"
|
device: "${PWD}/data"
|
||||||
openfang-data:
|
openfang-data:
|
||||||
driver: local
|
driver: local
|
||||||
|
vllm-cache:
|
||||||
|
driver: local
|
||||||
|
|
||||||
# ── Internal network ────────────────────────────────────────────────────────
|
# ── Internal network ────────────────────────────────────────────────────────
|
||||||
networks:
|
networks:
|
||||||
|
|||||||
@@ -94,8 +94,18 @@ class Settings(BaseSettings):
|
|||||||
|
|
||||||
# ── Backend selection ────────────────────────────────────────────────────
|
# ── Backend selection ────────────────────────────────────────────────────
|
||||||
# "ollama" — always use Ollama (default, safe everywhere)
|
# "ollama" — always use Ollama (default, safe everywhere)
|
||||||
|
# "vllm" — use vLLM inference server (OpenAI-compatible, faster throughput)
|
||||||
# "auto" — pick best available local backend, fall back to Ollama
|
# "auto" — pick best available local backend, fall back to Ollama
|
||||||
timmy_model_backend: Literal["ollama", "grok", "claude", "auto"] = "ollama"
|
timmy_model_backend: Literal["ollama", "vllm", "grok", "claude", "auto"] = "ollama"
|
||||||
|
|
||||||
|
# ── vLLM backend ──────────────────────────────────────────────────────────
|
||||||
|
# vLLM is an OpenAI-compatible inference server optimised for continuous
|
||||||
|
# batching — 3–10x higher throughput than Ollama for agentic workloads.
|
||||||
|
# Start server: python -m vllm.entrypoints.openai.api_server \
|
||||||
|
# --model Qwen/Qwen2.5-14B-Instruct --port 8001
|
||||||
|
# Then set TIMMY_LLM_BACKEND=vllm (or enable vllm-local in providers.yaml)
|
||||||
|
vllm_url: str = "http://localhost:8001"
|
||||||
|
vllm_model: str = "Qwen/Qwen2.5-14B-Instruct"
|
||||||
|
|
||||||
# ── Grok (xAI) — opt-in premium cloud backend ────────────────────────
|
# ── Grok (xAI) — opt-in premium cloud backend ────────────────────────
|
||||||
# Grok is a premium augmentation layer — local-first ethos preserved.
|
# Grok is a premium augmentation layer — local-first ethos preserved.
|
||||||
|
|||||||
@@ -124,6 +124,73 @@ async def check_ollama() -> bool:
|
|||||||
return dep.status == "healthy"
|
return dep.status == "healthy"
|
||||||
|
|
||||||
|
|
||||||
|
# vLLM health cache: last probe result and its time.monotonic() timestamp (30-second TTL)
|
||||||
|
_vllm_cache: DependencyStatus | None = None
|
||||||
|
_vllm_cache_ts: float = 0.0
|
||||||
|
_VLLM_CACHE_TTL = 30.0
|
||||||
|
|
||||||
|
|
||||||
|
def _check_vllm_sync() -> DependencyStatus:
    """Probe the vLLM server's ``/health`` endpoint synchronously.

    This call blocks for up to the 2-second request timeout, so run it via
    ``asyncio.to_thread()`` from async code.

    Returns:
        A ``DependencyStatus`` named ``"vLLM"``. ``status`` is ``"healthy"``
        when the endpoint answers HTTP 200. Otherwise it is ``"unavailable"``
        and ``details`` carries the generic ``error`` text plus a ``reason``
        entry with the concrete cause (exception text or unexpected status).
        This function never raises; every failure is folded into the result.
    """
    import urllib.request

    try:
        base_url = settings.vllm_url.rstrip("/")
        # vLLM exposes /health at the server root (strip /v1 if present)
        if base_url.endswith("/v1"):
            base_url = base_url[:-3]
        req = urllib.request.Request(
            f"{base_url}/health",
            method="GET",
            headers={"Accept": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=2) as response:
            if response.status == 200:
                return DependencyStatus(
                    name="vLLM",
                    status="healthy",
                    sovereignty_score=10,
                    details={"url": settings.vllm_url, "model": settings.vllm_model},
                )
            # The server answered, just not with 200; keep that distinction visible.
            reason = f"Unexpected HTTP status {response.status}"
    except Exception as exc:
        logger.debug("vLLM health check failed: %s", exc)
        # Some exceptions stringify to "", so fall back to the class name.
        reason = str(exc) or type(exc).__name__

    return DependencyStatus(
        name="vLLM",
        status="unavailable",
        sovereignty_score=10,
        details={
            "url": settings.vllm_url,
            "error": "Cannot connect to vLLM server",
            "reason": reason,
        },
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def _check_vllm() -> DependencyStatus:
    """Check vLLM backend status without blocking the event loop.

    The blocking probe (``_check_vllm_sync``) runs in a worker thread via
    ``asyncio.to_thread``. Results, healthy or not, are cached in module
    globals and reused for ``_VLLM_CACHE_TTL`` seconds.

    Returns:
        The cached ``DependencyStatus`` while it is still fresh, otherwise the
        result of a new probe. If the probe itself raises, an ``"unavailable"``
        status is returned (and cached) instead of propagating the error.
    """
    global _vllm_cache, _vllm_cache_ts  # noqa: PLW0603

    cached = _vllm_cache
    if cached is not None and (time.monotonic() - _vllm_cache_ts) < _VLLM_CACHE_TTL:
        return cached

    try:
        result = await asyncio.to_thread(_check_vllm_sync)
    except Exception as exc:
        logger.debug("vLLM async check failed: %s", exc)
        result = DependencyStatus(
            name="vLLM",
            status="unavailable",
            sovereignty_score=10,
            details={"url": settings.vllm_url, "error": "Cannot connect to vLLM server"},
        )

    _vllm_cache = result
    # Stamp after the probe returns so the TTL measures the age of the result,
    # not the moment the (possibly slow) probe was started.
    _vllm_cache_ts = time.monotonic()
    return result
|
||||||
|
|
||||||
|
|
||||||
def _check_lightning() -> DependencyStatus:
|
def _check_lightning() -> DependencyStatus:
|
||||||
"""Check Lightning payment backend status."""
|
"""Check Lightning payment backend status."""
|
||||||
return DependencyStatus(
|
return DependencyStatus(
|
||||||
@@ -195,13 +262,22 @@ async def health_check():
|
|||||||
# Legacy format for test compatibility
|
# Legacy format for test compatibility
|
||||||
ollama_ok = await check_ollama()
|
ollama_ok = await check_ollama()
|
||||||
|
|
||||||
agent_status = "idle" if ollama_ok else "offline"
|
# Check vLLM only when it is the configured backend (avoid probing unused services)
|
||||||
|
vllm_status: str | None = None
|
||||||
|
if settings.timmy_model_backend == "vllm":
|
||||||
|
vllm_dep = await _check_vllm()
|
||||||
|
vllm_status = "up" if vllm_dep.status == "healthy" else "down"
|
||||||
|
|
||||||
|
inference_ok = vllm_status == "up" if vllm_status is not None else ollama_ok
|
||||||
|
agent_status = "idle" if inference_ok else "offline"
|
||||||
|
|
||||||
|
services: dict = {"ollama": "up" if ollama_ok else "down"}
|
||||||
|
if vllm_status is not None:
|
||||||
|
services["vllm"] = vllm_status
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"status": "ok" if ollama_ok else "degraded",
|
"status": "ok" if inference_ok else "degraded",
|
||||||
"services": {
|
"services": services,
|
||||||
"ollama": "up" if ollama_ok else "down",
|
|
||||||
},
|
|
||||||
"agents": {
|
"agents": {
|
||||||
"agent": {"status": agent_status},
|
"agent": {"status": agent_status},
|
||||||
},
|
},
|
||||||
@@ -210,7 +286,7 @@ async def health_check():
|
|||||||
"version": "2.0.0",
|
"version": "2.0.0",
|
||||||
"uptime_seconds": uptime,
|
"uptime_seconds": uptime,
|
||||||
"llm_backend": settings.timmy_model_backend,
|
"llm_backend": settings.timmy_model_backend,
|
||||||
"llm_model": settings.ollama_model,
|
"llm_model": settings.vllm_model if settings.timmy_model_backend == "vllm" else settings.ollama_model,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -252,6 +328,9 @@ async def sovereignty_check():
|
|||||||
_check_lightning(),
|
_check_lightning(),
|
||||||
_check_sqlite(),
|
_check_sqlite(),
|
||||||
]
|
]
|
||||||
|
# Include vLLM in the audit when it is the active backend
|
||||||
|
if settings.timmy_model_backend == "vllm":
|
||||||
|
dependencies.append(await _check_vllm())
|
||||||
|
|
||||||
overall = _calculate_overall_score(dependencies)
|
overall = _calculate_overall_score(dependencies)
|
||||||
recommendations = _generate_recommendations(dependencies)
|
recommendations = _generate_recommendations(dependencies)
|
||||||
|
|||||||
@@ -331,6 +331,22 @@ class CascadeRouter:
|
|||||||
logger.debug("vllm-mlx provider check error: %s", exc)
|
logger.debug("vllm-mlx provider check error: %s", exc)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
elif provider.type == "vllm":
|
||||||
|
# Check if standard vLLM server is running (OpenAI-compatible API)
|
||||||
|
if requests is None:
|
||||||
|
return True
|
||||||
|
try:
|
||||||
|
base_url = provider.base_url or provider.url or settings.vllm_url
|
||||||
|
# Strip /v1 suffix — health endpoint is at the server root
|
||||||
|
server_root = base_url.rstrip("/")
|
||||||
|
if server_root.endswith("/v1"):
|
||||||
|
server_root = server_root[:-3]
|
||||||
|
response = requests.get(f"{server_root}/health", timeout=5)
|
||||||
|
return response.status_code == 200
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug("vllm provider check error: %s", exc)
|
||||||
|
return False
|
||||||
|
|
||||||
elif provider.type in ("openai", "anthropic", "grok"):
|
elif provider.type in ("openai", "anthropic", "grok"):
|
||||||
# Check if API key is set
|
# Check if API key is set
|
||||||
return provider.api_key is not None and provider.api_key != ""
|
return provider.api_key is not None and provider.api_key != ""
|
||||||
@@ -793,6 +809,14 @@ class CascadeRouter:
|
|||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
)
|
)
|
||||||
|
elif provider.type == "vllm":
|
||||||
|
result = await self._call_vllm(
|
||||||
|
provider=provider,
|
||||||
|
messages=messages,
|
||||||
|
model=model or provider.get_default_model(),
|
||||||
|
temperature=temperature,
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown provider type: {provider.type}")
|
raise ValueError(f"Unknown provider type: {provider.type}")
|
||||||
|
|
||||||
@@ -1031,6 +1055,49 @@ class CascadeRouter:
|
|||||||
"model": response.model,
|
"model": response.model,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async def _call_vllm(
    self,
    provider: Provider,
    messages: list[dict],
    model: str,
    temperature: float,
    max_tokens: int | None,
) -> dict:
    """Call a standard vLLM server via its OpenAI-compatible API.

    vLLM exposes the same /v1/chat/completions endpoint as OpenAI.
    No API key is required for local deployments.

    Default URL comes from settings.vllm_url (VLLM_URL env var).

    Args:
        provider: Provider entry; ``base_url``, then ``url``, then
            ``settings.vllm_url`` is used as the server address.
        messages: Chat messages forwarded unchanged to the API.
        model: Model name to request.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit; omitted from the request when falsy.

    Returns:
        A dict with ``"content"`` (text of the first choice) and ``"model"``
        (model name reported by the server).
    """
    import openai

    base_url = provider.base_url or provider.url or settings.vllm_url
    # Ensure the base_url ends with /v1 as expected by the OpenAI client
    if not base_url.rstrip("/").endswith("/v1"):
        base_url = base_url.rstrip("/") + "/v1"

    client = openai.AsyncOpenAI(
        # The client insists on a key, so pass a placeholder when none is configured.
        api_key=provider.api_key or "no-key-required",
        base_url=base_url,
        timeout=self.config.timeout_seconds,
    )

    kwargs: dict = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    if max_tokens:
        kwargs["max_tokens"] = max_tokens

    try:
        response = await client.chat.completions.create(**kwargs)
    finally:
        # A fresh client is built per call; close it so its HTTP connection
        # pool is released instead of lingering until garbage collection.
        await client.close()

    return {
        "content": response.choices[0].message.content,
        "model": response.model,
    }
|
||||||
|
|
||||||
def _record_success(self, provider: Provider, latency_ms: float) -> None:
|
def _record_success(self, provider: Provider, latency_ms: float) -> None:
|
||||||
"""Record a successful request."""
|
"""Record a successful request."""
|
||||||
provider.metrics.total_requests += 1
|
provider.metrics.total_requests += 1
|
||||||
|
|||||||
411
tests/infrastructure/test_vllm_backend.py
Normal file
411
tests/infrastructure/test_vllm_backend.py
Normal file
@@ -0,0 +1,411 @@
|
|||||||
|
"""Unit tests for the vLLM inference backend (issue #1281).
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
- vllm provider type in CascadeRouter availability check
|
||||||
|
- _call_vllm method (mocked OpenAI client)
|
||||||
|
- providers.yaml loads vllm-local entry
|
||||||
|
- vLLM health check helpers in dashboard routes
|
||||||
|
- config.py has vllm backend option
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from infrastructure.router.cascade import CascadeRouter, Provider, ProviderStatus
|
||||||
|
|
||||||
|
|
||||||
|
# ── Provider availability checks ────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.unit
class TestVllmProviderAvailability:
    """Exercise CascadeRouter._check_provider_available for ``type="vllm"`` providers."""

    def _make_vllm_provider(self, url: str = "http://localhost:8001/v1") -> Provider:
        """Build an enabled vLLM provider whose base_url is *url*."""
        return Provider(
            name="vllm-local",
            type="vllm",
            enabled=True,
            priority=3,
            base_url=url,
            models=[{"name": "Qwen/Qwen2.5-14B-Instruct", "default": True}],
        )

    @staticmethod
    def _reply(status_code: int) -> MagicMock:
        """Return a stand-in HTTP response carrying *status_code*."""
        reply = MagicMock()
        reply.status_code = status_code
        return reply

    def test_available_when_health_200(self, tmp_path):
        """A 200 from /health marks the provider available."""
        router = CascadeRouter(config_path=tmp_path / "none.yaml")
        vllm = self._make_vllm_provider()

        with patch("infrastructure.router.cascade.requests") as fake_requests:
            fake_requests.get.return_value = self._reply(200)
            is_up = router._check_provider_available(vllm)

        assert is_up is True
        # The probe must hit the server root, not the /v1 API prefix.
        requested_url = fake_requests.get.call_args[0][0]
        assert requested_url.endswith("/health")
        assert "/v1" not in requested_url

    def test_unavailable_when_health_non_200(self, tmp_path):
        """A non-200 from /health marks the provider unavailable."""
        router = CascadeRouter(config_path=tmp_path / "none.yaml")
        vllm = self._make_vllm_provider()

        with patch("infrastructure.router.cascade.requests") as fake_requests:
            fake_requests.get.return_value = self._reply(503)
            is_up = router._check_provider_available(vllm)

        assert is_up is False

    def test_unavailable_on_connection_error(self, tmp_path):
        """A failed connection marks the provider unavailable."""
        router = CascadeRouter(config_path=tmp_path / "none.yaml")
        vllm = self._make_vllm_provider()

        with patch("infrastructure.router.cascade.requests") as fake_requests:
            fake_requests.get.side_effect = ConnectionError("refused")
            is_up = router._check_provider_available(vllm)

        assert is_up is False

    def test_strips_v1_suffix_for_health_check(self, tmp_path):
        """The /v1 suffix is removed before /health is appended."""
        router = CascadeRouter(config_path=tmp_path / "none.yaml")
        vllm = self._make_vllm_provider(url="http://localhost:8001/v1")

        with patch("infrastructure.router.cascade.requests") as fake_requests:
            fake_requests.get.return_value = self._reply(200)
            router._check_provider_available(vllm)

        requested_url = fake_requests.get.call_args[0][0]
        assert requested_url == "http://localhost:8001/health"

    def test_assumes_available_when_requests_none(self, tmp_path):
        """With the requests library absent, the provider is assumed available."""
        router = CascadeRouter(config_path=tmp_path / "none.yaml")
        vllm = self._make_vllm_provider()

        with patch("infrastructure.router.cascade.requests", None):
            is_up = router._check_provider_available(vllm)

        assert is_up is True
|
||||||
|
|
||||||
|
|
||||||
|
# ── _call_vllm method ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.unit
class TestCallVllm:
    """Test CascadeRouter._call_vllm against a mocked OpenAI client."""

    _USER_HI = [{"role": "user", "content": "hi"}]

    def _make_router(self, tmp_path: Path) -> CascadeRouter:
        """Build a router pointed at a config file that does not exist."""
        return CascadeRouter(config_path=tmp_path / "none.yaml")

    def _make_provider(self, base_url: str = "http://localhost:8001") -> Provider:
        """Build an enabled vLLM provider whose base_url is *base_url*."""
        return Provider(
            name="vllm-local",
            type="vllm",
            enabled=True,
            priority=3,
            base_url=base_url,
            models=[{"name": "Qwen/Qwen2.5-14B-Instruct", "default": True}],
        )

    @staticmethod
    def _fake_client(content: str = "ok", model: str = "model") -> AsyncMock:
        """Return a mock client whose chat completion yields *content* / *model*."""
        choice = MagicMock()
        choice.message.content = content
        completion = MagicMock()
        completion.choices = [choice]
        completion.model = model

        client = AsyncMock()
        client.chat.completions.create = AsyncMock(return_value=completion)
        return client

    @pytest.mark.asyncio
    async def test_returns_content_and_model(self, tmp_path):
        """The content and model name from the API response are returned."""
        router = self._make_router(tmp_path)
        client = self._fake_client(
            content="Hello from vLLM!", model="Qwen/Qwen2.5-14B-Instruct"
        )

        with patch("openai.AsyncOpenAI", return_value=client):
            result = await router._call_vllm(
                provider=self._make_provider(),
                messages=[{"role": "user", "content": "hi"}],
                model="Qwen/Qwen2.5-14B-Instruct",
                temperature=0.7,
                max_tokens=None,
            )

        assert result["content"] == "Hello from vLLM!"
        assert result["model"] == "Qwen/Qwen2.5-14B-Instruct"

    @pytest.mark.asyncio
    async def test_appends_v1_to_base_url(self, tmp_path):
        """A base_url without /v1 gets it appended for the OpenAI client."""
        router = self._make_router(tmp_path)
        client = self._fake_client()

        with patch("openai.AsyncOpenAI", return_value=client) as client_factory:
            await router._call_vllm(
                provider=self._make_provider(base_url="http://localhost:8001"),
                messages=[{"role": "user", "content": "hi"}],
                model="model",
                temperature=0.0,
                max_tokens=None,
            )
            _, ctor_kwargs = client_factory.call_args
            assert ctor_kwargs["base_url"].endswith("/v1")

    @pytest.mark.asyncio
    async def test_does_not_double_v1(self, tmp_path):
        """A base_url already ending in /v1 is passed through unchanged."""
        router = self._make_router(tmp_path)
        client = self._fake_client()

        with patch("openai.AsyncOpenAI", return_value=client) as client_factory:
            await router._call_vllm(
                provider=self._make_provider(base_url="http://localhost:8001/v1"),
                messages=[{"role": "user", "content": "hi"}],
                model="model",
                temperature=0.0,
                max_tokens=None,
            )
            _, ctor_kwargs = client_factory.call_args
            assert ctor_kwargs["base_url"] == "http://localhost:8001/v1"

    @pytest.mark.asyncio
    async def test_max_tokens_passed_when_set(self, tmp_path):
        """max_tokens is forwarded to the API when provided."""
        router = self._make_router(tmp_path)
        client = self._fake_client()

        with patch("openai.AsyncOpenAI", return_value=client):
            await router._call_vllm(
                provider=self._make_provider(),
                messages=[{"role": "user", "content": "hi"}],
                model="model",
                temperature=0.0,
                max_tokens=256,
            )
            sent = client.chat.completions.create.call_args[1]
            assert sent.get("max_tokens") == 256

    @pytest.mark.asyncio
    async def test_max_tokens_omitted_when_none(self, tmp_path):
        """The max_tokens key is absent from the request when not provided."""
        router = self._make_router(tmp_path)
        client = self._fake_client()

        with patch("openai.AsyncOpenAI", return_value=client):
            await router._call_vllm(
                provider=self._make_provider(),
                messages=[{"role": "user", "content": "hi"}],
                model="model",
                temperature=0.0,
                max_tokens=None,
            )
            sent = client.chat.completions.create.call_args[1]
            assert "max_tokens" not in sent
|
||||||
|
|
||||||
|
|
||||||
|
# ── providers.yaml loads vllm-local ─────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.unit
class TestProvidersYamlVllm:
    """Verify providers.yaml contains a valid vllm-local entry."""

    @staticmethod
    def _providers_path() -> Path:
        """Locate config/providers.yaml two directories above this test file."""
        return Path(__file__).parents[2] / "config" / "providers.yaml"

    @staticmethod
    def _read_providers(path: Path) -> list:
        """Parse *path* and return its ``providers`` list (empty if the key is missing)."""
        with path.open() as handle:
            parsed = yaml.safe_load(handle)
        return parsed.get("providers", [])

    def test_vllm_local_entry_exists(self):
        """providers.yaml has a vllm-local provider of type vllm."""
        config_path = self._providers_path()
        assert config_path.exists(), "config/providers.yaml not found"

        entries = self._read_providers(config_path)
        vllm_entries = [entry for entry in entries if entry.get("type") == "vllm"]
        assert vllm_entries, "No provider with type=vllm found in providers.yaml"

        named_local = [entry for entry in vllm_entries if entry["name"] == "vllm-local"]
        vllm_local = named_local[0] if named_local else None
        assert vllm_local is not None, "vllm-local provider not found in providers.yaml"

    def test_vllm_local_disabled_by_default(self):
        """vllm-local is disabled by default so the router stays on Ollama."""
        entries = self._read_providers(self._providers_path())
        vllm_local = next(
            (entry for entry in entries if entry.get("name") == "vllm-local"), None
        )

        assert vllm_local is not None
        assert vllm_local.get("enabled") is False, "vllm-local should be disabled by default"

    def test_vllm_local_has_default_model(self):
        """vllm-local declares at least one model, one of which is the default."""
        entries = self._read_providers(self._providers_path())
        vllm_local = next(
            (entry for entry in entries if entry.get("name") == "vllm-local"), None
        )
        assert vllm_local is not None

        declared = vllm_local.get("models", [])
        assert declared, "vllm-local must declare at least one model"
        flagged_default = [model for model in declared if model.get("default")]
        assert flagged_default, "vllm-local must have a model marked default: true"
|
||||||
|
|
||||||
|
|
||||||
|
# ── config.py backend option ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.unit
class TestConfigVllmBackend:
    """Verify config.py exposes the vllm backend option."""

    def test_vllm_is_valid_backend(self):
        """timmy_model_backend accepts 'vllm' without validation errors."""
        from config import Settings

        configured = Settings(timmy_model_backend="vllm")
        assert configured.timmy_model_backend == "vllm"

    def test_vllm_url_default(self):
        """vllm_url has a sensible default."""
        from config import Settings

        url = Settings().vllm_url
        assert url.startswith("http://")

    def test_vllm_model_default(self):
        """vllm_model has a sensible default."""
        from config import Settings

        model_name = Settings().vllm_model
        assert model_name  # non-empty string
|
||||||
|
|
||||||
|
|
||||||
|
# ── Health check helpers ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.unit
class TestVllmHealthCheck:
    """Test _check_vllm_sync and _check_vllm."""

    @pytest.fixture(autouse=True)
    def _isolate_vllm_cache(self):
        """Give each test an empty vLLM cache and restore module state afterwards.

        The cache lives in module globals, so without this a mock left behind
        by one test could be served to unrelated tests for the next 30 seconds.
        """
        import dashboard.routes.health as health_module

        saved = (health_module._vllm_cache, health_module._vllm_cache_ts)
        health_module._vllm_cache = None
        health_module._vllm_cache_ts = 0.0
        yield
        health_module._vllm_cache, health_module._vllm_cache_ts = saved

    def test_sync_returns_healthy_on_200(self):
        """_check_vllm_sync returns 'healthy' when server responds 200."""
        import urllib.request

        from dashboard.routes.health import _check_vllm_sync

        mock_response = MagicMock()
        mock_response.status = 200
        # urlopen() is used as a context manager, so the mock must support `with`.
        mock_response.__enter__ = lambda s: s
        mock_response.__exit__ = MagicMock(return_value=False)

        with patch.object(urllib.request, "urlopen", return_value=mock_response):
            result = _check_vllm_sync()

        assert result.status == "healthy"
        assert result.name == "vLLM"

    def test_sync_returns_unavailable_on_connection_error(self):
        """_check_vllm_sync returns 'unavailable' when server is unreachable."""
        import urllib.error
        import urllib.request

        from dashboard.routes.health import _check_vllm_sync

        with patch.object(urllib.request, "urlopen", side_effect=urllib.error.URLError("refused")):
            result = _check_vllm_sync()

        assert result.status == "unavailable"
        assert result.name == "vLLM"

    @pytest.mark.asyncio
    async def test_async_caches_result(self):
        """_check_vllm serves the second call from cache without probing again."""
        from dashboard.routes.health import _check_vllm

        mock_dep = MagicMock()
        mock_dep.status = "healthy"

        with patch("dashboard.routes.health._check_vllm_sync", return_value=mock_dep) as mock_fn:
            result1 = await _check_vllm()
            result2 = await _check_vllm()  # should hit cache

        assert result1 is mock_dep
        assert result2 is mock_dep
        # Identity alone proves nothing (the patch returns the same object on
        # every call); the call count is what shows the cache was used.
        mock_fn.assert_called_once()

    @pytest.mark.asyncio
    async def test_async_refreshes_after_ttl(self):
        """_check_vllm discards a cached result older than the TTL and probes again."""
        import dashboard.routes.health as health_module
        from dashboard.routes.health import _VLLM_CACHE_TTL, _check_vllm

        # Seed a populated but expired entry so the TTL comparison is what
        # forces the refresh (an empty cache would skip that branch entirely).
        stale_dep = MagicMock()
        stale_dep.status = "healthy"
        health_module._vllm_cache = stale_dep
        health_module._vllm_cache_ts = time.monotonic() - _VLLM_CACHE_TTL - 1

        fresh_dep = MagicMock()
        fresh_dep.status = "unavailable"

        with patch("dashboard.routes.health._check_vllm_sync", return_value=fresh_dep) as mock_fn:
            result = await _check_vllm()

        mock_fn.assert_called_once()
        assert result is fresh_dep
        assert health_module._vllm_cache is fresh_dep
|
||||||
Reference in New Issue
Block a user