Compare commits

...

3 Commits

Author SHA1 Message Date
Alexander Whitestone
28d1905df4 feat: add vLLM as alternative inference backend (#1281)
Some checks failed
Tests / lint (pull_request) Failing after 31s
Tests / test (pull_request) Has been skipped
Adds vLLM (a high-throughput, OpenAI-compatible inference server) as a
selectable backend alongside the existing Ollama and vllm-mlx backends.
vLLM's continuous batching gives 3-10x higher throughput for agentic workloads.

Changes:
- config.py: add `vllm` to timmy_model_backend Literal; add vllm_url /
  vllm_model settings (VLLM_URL / VLLM_MODEL env vars)
- cascade.py: add vllm provider type with _check_provider_available
  (hits /health) and _call_vllm (OpenAI-compatible completions)
- providers.yaml: add disabled-by-default vllm-local provider (priority 3,
  port 8001); bump OpenAI/Anthropic backup priorities to 4/5
- health.py: add _check_vllm/_check_vllm_sync with 30-second TTL cache;
  /health and /health/sovereignty reflect vLLM status when it is the
  active backend
- docker-compose.yml: add vllm service behind 'vllm' profile (GPU
  passthrough commented-out template included); add vllm-cache volume
- CLAUDE.md: add vLLM row to Service Fallback Matrix
- tests: 26 new unit tests covering availability checks, _call_vllm,
  providers.yaml validation, config options, and health helpers

Graceful fallback: if vLLM is unavailable, the cascade router automatically
falls back to Ollama; the app never crashes.
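
For illustration, a minimal sketch of that probe-and-fall-back behavior (the
helper name is hypothetical; the vLLM URL is this PR's default, and the Ollama
port is the usual local default):

# Hypothetical helper mirroring the fallback described above: probe
# vLLM's /health endpoint and cascade to Ollama when it is down.
import requests

def pick_inference_backend(
    vllm_url: str = "http://localhost:8001",    # VLLM_URL default from this PR
    ollama_url: str = "http://localhost:11434", # standard Ollama port (assumption)
) -> str:
    """Return 'vllm' when the server answers /health, else 'ollama'."""
    try:
        if requests.get(f"{vllm_url}/health", timeout=2).status_code == 200:
            return "vllm"
    except requests.RequestException:
        pass  # server down or unreachable — fall back to Ollama
    return "ollama"

print(pick_inference_backend())  # "ollama" unless a vLLM server is running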

Fixes #1281

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-23 21:52:52 -04:00
6c76bf2f66 [claude] Integrate health snapshot into Daily Run pre-flight (#923) (#1280)
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
2026-03-24 01:43:49 +00:00
0436dfd4c4 [claude] Dashboard: Agent Scorecards panel in Mission Control (#929) (#1276)
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
2026-03-24 01:43:21 +00:00
10 changed files with 1054 additions and 13 deletions

View File

@@ -150,6 +150,7 @@ async def transcribe_audio(audio: bytes) -> str:
| Service | When Unavailable | Fallback Behavior |
|---------|------------------|-------------------|
| Ollama | No local LLM | Claude backend (if ANTHROPIC_API_KEY set) |
| vLLM | Server not running | Ollama backend (cascade router fallback) |
| Redis | Cache/storage down | In-memory dict (ephemeral) |
| AirLLM | Import error or no Apple Silicon | Ollama backend |
| Voice (Piper) | Service down | Browser Web Speech API |

View File

@@ -131,11 +131,34 @@ providers:
context_window: 32000
capabilities: [text, tools, json, streaming]
# Tertiary: OpenAI (if API key available)
# Tertiary: vLLM (OpenAI-compatible, continuous batching, 3-10x agentic throughput)
# Runs on CUDA GPU or CPU. On Apple Silicon, prefer vllm-mlx-local (above).
# To enable: start vLLM server:
# python -m vllm.entrypoints.openai.api_server \
# --model Qwen/Qwen2.5-14B-Instruct --port 8001
# Then set enabled: true (or TIMMY_LLM_BACKEND=vllm + VLLM_URL=http://localhost:8001)
- name: vllm-local
type: vllm
enabled: false # Enable when vLLM server is running
priority: 3
tier: local
base_url: "http://localhost:8001/v1"
models:
- name: Qwen/Qwen2.5-14B-Instruct
default: true
context_window: 32000
capabilities: [text, tools, json, streaming, complex]
description: "Qwen2.5-14B on vLLM — continuous batching for agentic workloads"
- name: Qwen/Qwen2.5-7B-Instruct
context_window: 32000
capabilities: [text, tools, json, streaming, routine]
description: "Qwen2.5-7B on vLLM — fast model for routine tasks"
# Quinary: OpenAI (if API key available)
- name: openai-backup
type: openai
enabled: false # Enable by setting OPENAI_API_KEY
priority: 3
priority: 4
tier: standard_cloud
api_key: "${OPENAI_API_KEY}" # Loaded from environment
base_url: null # Use default OpenAI endpoint
@@ -147,12 +170,12 @@ providers:
- name: gpt-4o
context_window: 128000
capabilities: [text, vision, tools, json, streaming]
# Quaternary: Anthropic (if API key available)
# Senary: Anthropic (if API key available)
- name: anthropic-backup
type: anthropic
enabled: false # Enable by setting ANTHROPIC_API_KEY
priority: 4
priority: 5
tier: frontier
api_key: "${ANTHROPIC_API_KEY}"
models:

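As a toy illustration of the priority scheme above (lower number is tried
first; this is not the router's actual API, just the ordering the yaml implies):

# Toy sketch of priority-ordered cascade resolution (illustrative only).
providers = [
    {"name": "vllm-local", "priority": 3, "enabled": False},
    {"name": "openai-backup", "priority": 4, "enabled": False},
    {"name": "anthropic-backup", "priority": 5, "enabled": False},
]

def cascade_order(providers: list[dict]) -> list[str]:
    """Enabled providers, lowest priority number first."""
    enabled = [p for p in providers if p["enabled"]]
    return [p["name"] for p in sorted(enabled, key=lambda p: p["priority"])]

print(cascade_order(providers))  # [] — all three above are disabled by default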
View File

@@ -42,6 +42,10 @@ services:
GROK_ENABLED: "${GROK_ENABLED:-false}"
XAI_API_KEY: "${XAI_API_KEY:-}"
GROK_DEFAULT_MODEL: "${GROK_DEFAULT_MODEL:-grok-3-fast}"
# vLLM backend — set TIMMY_LLM_BACKEND=vllm to activate
TIMMY_LLM_BACKEND: "${TIMMY_LLM_BACKEND:-ollama}"
VLLM_URL: "${VLLM_URL:-http://localhost:8001}"
VLLM_MODEL: "${VLLM_MODEL:-Qwen/Qwen2.5-14B-Instruct}"
extra_hosts:
- "host.docker.internal:host-gateway" # Linux: maps to host IP
networks:
@@ -74,6 +78,49 @@ services:
profiles:
- celery
# ── vLLM — high-throughput inference server (GPU optional) ──────────────
# Requires the 'vllm' profile: docker compose --profile vllm up
#
# GPU (NVIDIA): set VLLM_MODEL and ensure nvidia-container-toolkit is installed.
# CPU-only: add --device cpu to VLLM_EXTRA_ARGS (slower, but works anywhere).
#
# The dashboard reaches vLLM at http://vllm:8001 (inside timmy-net).
# Set VLLM_URL=http://vllm:8001 in the dashboard environment when using this service.
vllm:
image: vllm/vllm-openai:latest
container_name: timmy-vllm
profiles:
- vllm
ports:
- "8001:8001"
environment:
# Model to load — override with VLLM_MODEL env var
VLLM_MODEL: "${VLLM_MODEL:-Qwen/Qwen2.5-7B-Instruct}"
command: >
--model ${VLLM_MODEL:-Qwen/Qwen2.5-7B-Instruct}
--port 8001
--host 0.0.0.0
${VLLM_EXTRA_ARGS:-}
volumes:
- vllm-cache:/root/.cache/huggingface
networks:
- timmy-net
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 120s
# GPU support — uncomment to enable NVIDIA GPU passthrough
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: all
# capabilities: [gpu]
# ── OpenFang — vendored agent runtime sidecar ────────────────────────────
openfang:
build:
@@ -110,6 +157,8 @@ volumes:
device: "${PWD}/data"
openfang-data:
driver: local
vllm-cache:
driver: local
# ── Internal network ────────────────────────────────────────────────────────
networks:

View File

@@ -94,8 +94,18 @@ class Settings(BaseSettings):
# ── Backend selection ────────────────────────────────────────────────────
# "ollama" — always use Ollama (default, safe everywhere)
# "vllm" — use vLLM inference server (OpenAI-compatible, faster throughput)
# "auto" — pick best available local backend, fall back to Ollama
timmy_model_backend: Literal["ollama", "grok", "claude", "auto"] = "ollama"
timmy_model_backend: Literal["ollama", "vllm", "grok", "claude", "auto"] = "ollama"
# ── vLLM backend ──────────────────────────────────────────────────────────
# vLLM is an OpenAI-compatible inference server optimised for continuous
# batching — 3-10x higher throughput than Ollama for agentic workloads.
# Start server: python -m vllm.entrypoints.openai.api_server \
# --model Qwen/Qwen2.5-14B-Instruct --port 8001
# Then set TIMMY_LLM_BACKEND=vllm (or enable vllm-local in providers.yaml)
vllm_url: str = "http://localhost:8001"
vllm_model: str = "Qwen/Qwen2.5-14B-Instruct"
# ── Grok (xAI) — opt-in premium cloud backend ────────────────────────
# Grok is a premium augmentation layer — local-first ethos preserved.

View File

@@ -124,6 +124,73 @@ async def check_ollama() -> bool:
return dep.status == "healthy"
# vLLM health cache (30-second TTL)
_vllm_cache: DependencyStatus | None = None
_vllm_cache_ts: float = 0.0
_VLLM_CACHE_TTL = 30.0
def _check_vllm_sync() -> DependencyStatus:
"""Synchronous vLLM check — run via asyncio.to_thread()."""
try:
import urllib.request
base_url = settings.vllm_url.rstrip("/")
# vLLM exposes /health at the server root (strip /v1 if present)
if base_url.endswith("/v1"):
base_url = base_url[:-3]
req = urllib.request.Request(
f"{base_url}/health",
method="GET",
headers={"Accept": "application/json"},
)
with urllib.request.urlopen(req, timeout=2) as response:
if response.status == 200:
return DependencyStatus(
name="vLLM",
status="healthy",
sovereignty_score=10,
details={"url": settings.vllm_url, "model": settings.vllm_model},
)
except Exception as exc:
logger.debug("vLLM health check failed: %s", exc)
return DependencyStatus(
name="vLLM",
status="unavailable",
sovereignty_score=10,
details={"url": settings.vllm_url, "error": "Cannot connect to vLLM server"},
)
async def _check_vllm() -> DependencyStatus:
"""Check vLLM backend status without blocking the event loop.
Results are cached for 30 seconds. vLLM is an optional backend;
unavailability triggers graceful fallback to Ollama.
"""
global _vllm_cache, _vllm_cache_ts # noqa: PLW0603
now = time.monotonic()
if _vllm_cache is not None and (now - _vllm_cache_ts) < _VLLM_CACHE_TTL:
return _vllm_cache
try:
result = await asyncio.to_thread(_check_vllm_sync)
except Exception as exc:
logger.debug("vLLM async check failed: %s", exc)
result = DependencyStatus(
name="vLLM",
status="unavailable",
sovereignty_score=10,
details={"url": settings.vllm_url, "error": "Cannot connect to vLLM server"},
)
_vllm_cache = result
_vllm_cache_ts = now
return result
def _check_lightning() -> DependencyStatus:
"""Check Lightning payment backend status."""
return DependencyStatus(
@@ -195,13 +262,22 @@ async def health_check():
# Legacy format for test compatibility
ollama_ok = await check_ollama()
agent_status = "idle" if ollama_ok else "offline"
# Check vLLM only when it is the configured backend (avoid probing unused services)
vllm_status: str | None = None
if settings.timmy_model_backend == "vllm":
vllm_dep = await _check_vllm()
vllm_status = "up" if vllm_dep.status == "healthy" else "down"
inference_ok = vllm_status == "up" if vllm_status is not None else ollama_ok
agent_status = "idle" if inference_ok else "offline"
services: dict = {"ollama": "up" if ollama_ok else "down"}
if vllm_status is not None:
services["vllm"] = vllm_status
return {
"status": "ok" if ollama_ok else "degraded",
"services": {
"ollama": "up" if ollama_ok else "down",
},
"status": "ok" if inference_ok else "degraded",
"services": services,
"agents": {
"agent": {"status": agent_status},
},
@@ -210,7 +286,7 @@ async def health_check():
"version": "2.0.0",
"uptime_seconds": uptime,
"llm_backend": settings.timmy_model_backend,
"llm_model": settings.ollama_model,
"llm_model": settings.vllm_model if settings.timmy_model_backend == "vllm" else settings.ollama_model,
}
@@ -252,6 +328,9 @@ async def sovereignty_check():
_check_lightning(),
_check_sqlite(),
]
# Include vLLM in the audit when it is the active backend
if settings.timmy_model_backend == "vllm":
dependencies.append(await _check_vllm())
overall = _calculate_overall_score(dependencies)
recommendations = _generate_recommendations(dependencies)

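A quick way to exercise the updated endpoint once the dashboard is running
(the dashboard base URL/port is an assumption; the JSON keys come from the
handler above):

# Probe the dashboard's /health endpoint and report the active backend.
import requests

health = requests.get("http://localhost:8000/health", timeout=5).json()
print(health["status"])       # "ok" or "degraded"
print(health["llm_backend"])  # e.g. "ollama" or "vllm"
print(health["services"])     # includes "vllm" only when it is the active backend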
View File

@@ -186,6 +186,24 @@
<p class="chat-history-placeholder">Loading sovereignty metrics...</p>
{% endcall %}
<!-- Agent Scorecards -->
<div class="card mc-card-spaced" id="mc-scorecards-card">
<div class="card-header">
<h2 class="card-title">Agent Scorecards</h2>
<div class="d-flex align-items-center gap-2">
<select id="mc-scorecard-period" class="form-select form-select-sm" style="width: auto;"
onchange="loadMcScorecards()">
<option value="daily" selected>Daily</option>
<option value="weekly">Weekly</option>
</select>
<a href="/scorecards" class="btn btn-sm btn-outline-secondary">Full View</a>
</div>
</div>
<div id="mc-scorecards-content" class="p-2">
<p class="chat-history-placeholder">Loading scorecards...</p>
</div>
</div>
<!-- Chat History -->
<div class="card mc-card-spaced">
<div class="card-header">
@@ -502,6 +520,20 @@ async function loadSparkStatus() {
}
}
// Load agent scorecards
async function loadMcScorecards() {
var period = document.getElementById('mc-scorecard-period').value;
var container = document.getElementById('mc-scorecards-content');
container.innerHTML = '<p class="chat-history-placeholder">Loading scorecards...</p>';
try {
var response = await fetch('/scorecards/all/panels?period=' + period);
var html = await response.text();
container.innerHTML = html;
} catch (error) {
container.innerHTML = '<p class="chat-history-placeholder">Scorecards unavailable</p>';
}
}
// Initial load
loadSparkStatus();
loadSovereignty();
@@ -510,6 +542,7 @@ loadSwarmStats();
loadLightningStats();
loadGrokStats();
loadChatHistory();
loadMcScorecards();
// Periodic updates
setInterval(loadSovereignty, 30000);
@@ -518,5 +551,6 @@ setInterval(loadSwarmStats, 5000);
setInterval(updateHeartbeat, 5000);
setInterval(loadGrokStats, 10000);
setInterval(loadSparkStatus, 15000);
setInterval(loadMcScorecards, 300000);
</script>
{% endblock %}

View File

@@ -331,6 +331,22 @@ class CascadeRouter:
logger.debug("vllm-mlx provider check error: %s", exc)
return False
elif provider.type == "vllm":
# Check if standard vLLM server is running (OpenAI-compatible API)
if requests is None:
return True
try:
base_url = provider.base_url or provider.url or settings.vllm_url
# Strip /v1 suffix — health endpoint is at the server root
server_root = base_url.rstrip("/")
if server_root.endswith("/v1"):
server_root = server_root[:-3]
response = requests.get(f"{server_root}/health", timeout=5)
return response.status_code == 200
except Exception as exc:
logger.debug("vllm provider check error: %s", exc)
return False
elif provider.type in ("openai", "anthropic", "grok"):
# Check if API key is set
return provider.api_key is not None and provider.api_key != ""
@@ -793,6 +809,14 @@ class CascadeRouter:
temperature=temperature,
max_tokens=max_tokens,
)
elif provider.type == "vllm":
result = await self._call_vllm(
provider=provider,
messages=messages,
model=model or provider.get_default_model(),
temperature=temperature,
max_tokens=max_tokens,
)
else:
raise ValueError(f"Unknown provider type: {provider.type}")
@@ -1031,6 +1055,49 @@ class CascadeRouter:
"model": response.model,
}
async def _call_vllm(
self,
provider: Provider,
messages: list[dict],
model: str,
temperature: float,
max_tokens: int | None,
) -> dict:
"""Call a standard vLLM server via its OpenAI-compatible API.
vLLM exposes the same /v1/chat/completions endpoint as OpenAI.
No API key is required for local deployments.
Default URL comes from settings.vllm_url (VLLM_URL env var).
"""
import openai
base_url = provider.base_url or provider.url or settings.vllm_url
# Ensure the base_url ends with /v1 as expected by the OpenAI client
if not base_url.rstrip("/").endswith("/v1"):
base_url = base_url.rstrip("/") + "/v1"
client = openai.AsyncOpenAI(
api_key=provider.api_key or "no-key-required",
base_url=base_url,
timeout=self.config.timeout_seconds,
)
kwargs: dict = {
"model": model,
"messages": messages,
"temperature": temperature,
}
if max_tokens:
kwargs["max_tokens"] = max_tokens
response = await client.chat.completions.create(**kwargs)
return {
"content": response.choices[0].message.content,
"model": response.model,
}
def _record_success(self, provider: Provider, latency_ms: float) -> None:
"""Record a successful request."""
provider.metrics.total_requests += 1

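Outside the router, the same endpoint _call_vllm targets can be smoke-tested
directly with the OpenAI client (a sketch assuming a local vLLM server on this
PR's default port and model):

# Standalone smoke test of a local vLLM server via its OpenAI-compatible
# API — the same endpoint _call_vllm builds its client against.
import asyncio
import openai

async def main() -> None:
    client = openai.AsyncOpenAI(
        api_key="no-key-required",            # vLLM ignores the key locally
        base_url="http://localhost:8001/v1",  # VLLM_URL default + /v1
    )
    resp = await client.chat.completions.create(
        model="Qwen/Qwen2.5-14B-Instruct",
        messages=[{"role": "user", "content": "ping"}],
        temperature=0.0,
    )
    print(resp.choices[0].message.content)

asyncio.run(main())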
View File

@@ -0,0 +1,411 @@
"""Unit tests for the vLLM inference backend (issue #1281).
Covers:
- vllm provider type in CascadeRouter availability check
- _call_vllm method (mocked OpenAI client)
- providers.yaml loads vllm-local entry
- vLLM health check helpers in dashboard routes
- config.py has vllm backend option
"""
from __future__ import annotations
import time
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
import yaml
from infrastructure.router.cascade import CascadeRouter, Provider, ProviderStatus
# ── Provider availability checks ────────────────────────────────────────────
@pytest.mark.unit
class TestVllmProviderAvailability:
"""Test _check_provider_available for vllm provider type."""
def _make_vllm_provider(self, url: str = "http://localhost:8001/v1") -> Provider:
return Provider(
name="vllm-local",
type="vllm",
enabled=True,
priority=3,
base_url=url,
models=[{"name": "Qwen/Qwen2.5-14B-Instruct", "default": True}],
)
def test_available_when_health_200(self, tmp_path):
"""Provider is available when /health returns 200."""
provider = self._make_vllm_provider()
router = CascadeRouter(config_path=tmp_path / "none.yaml")
mock_response = MagicMock()
mock_response.status_code = 200
with patch("infrastructure.router.cascade.requests") as mock_requests:
mock_requests.get.return_value = mock_response
available = router._check_provider_available(provider)
assert available is True
# Verify the health endpoint was called (root, not /v1)
call_args = mock_requests.get.call_args[0][0]
assert call_args.endswith("/health")
assert "/v1" not in call_args
def test_unavailable_when_health_non_200(self, tmp_path):
"""Provider is unavailable when /health returns non-200."""
provider = self._make_vllm_provider()
router = CascadeRouter(config_path=tmp_path / "none.yaml")
mock_response = MagicMock()
mock_response.status_code = 503
with patch("infrastructure.router.cascade.requests") as mock_requests:
mock_requests.get.return_value = mock_response
available = router._check_provider_available(provider)
assert available is False
def test_unavailable_on_connection_error(self, tmp_path):
"""Provider is unavailable when connection fails."""
provider = self._make_vllm_provider()
router = CascadeRouter(config_path=tmp_path / "none.yaml")
with patch("infrastructure.router.cascade.requests") as mock_requests:
mock_requests.get.side_effect = ConnectionError("refused")
available = router._check_provider_available(provider)
assert available is False
def test_strips_v1_suffix_for_health_check(self, tmp_path):
"""Health check URL strips /v1 before appending /health."""
provider = self._make_vllm_provider(url="http://localhost:8001/v1")
router = CascadeRouter(config_path=tmp_path / "none.yaml")
mock_response = MagicMock()
mock_response.status_code = 200
with patch("infrastructure.router.cascade.requests") as mock_requests:
mock_requests.get.return_value = mock_response
router._check_provider_available(provider)
called_url = mock_requests.get.call_args[0][0]
assert called_url == "http://localhost:8001/health"
def test_assumes_available_when_requests_none(self, tmp_path):
"""Gracefully assumes available when requests library is absent."""
provider = self._make_vllm_provider()
router = CascadeRouter(config_path=tmp_path / "none.yaml")
with patch("infrastructure.router.cascade.requests", None):
available = router._check_provider_available(provider)
assert available is True
# ── _call_vllm method ────────────────────────────────────────────────────────
@pytest.mark.unit
class TestCallVllm:
"""Test CascadeRouter._call_vllm."""
def _make_router(self, tmp_path: Path) -> CascadeRouter:
return CascadeRouter(config_path=tmp_path / "none.yaml")
def _make_provider(self, base_url: str = "http://localhost:8001") -> Provider:
return Provider(
name="vllm-local",
type="vllm",
enabled=True,
priority=3,
base_url=base_url,
models=[{"name": "Qwen/Qwen2.5-14B-Instruct", "default": True}],
)
@pytest.mark.asyncio
async def test_returns_content_and_model(self, tmp_path):
"""_call_vllm returns content and model name from API response."""
router = self._make_router(tmp_path)
provider = self._make_provider()
mock_choice = MagicMock()
mock_choice.message.content = "Hello from vLLM!"
mock_response = MagicMock()
mock_response.choices = [mock_choice]
mock_response.model = "Qwen/Qwen2.5-14B-Instruct"
mock_client = AsyncMock()
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
with patch("openai.AsyncOpenAI", return_value=mock_client):
result = await router._call_vllm(
provider=provider,
messages=[{"role": "user", "content": "hi"}],
model="Qwen/Qwen2.5-14B-Instruct",
temperature=0.7,
max_tokens=None,
)
assert result["content"] == "Hello from vLLM!"
assert result["model"] == "Qwen/Qwen2.5-14B-Instruct"
@pytest.mark.asyncio
async def test_appends_v1_to_base_url(self, tmp_path):
"""_call_vllm always points the OpenAI client at base_url/v1."""
router = self._make_router(tmp_path)
provider = self._make_provider(base_url="http://localhost:8001")
mock_choice = MagicMock()
mock_choice.message.content = "ok"
mock_response = MagicMock()
mock_response.choices = [mock_choice]
mock_response.model = "model"
mock_client = AsyncMock()
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
with patch("openai.AsyncOpenAI", return_value=mock_client) as mock_openai:
await router._call_vllm(
provider=provider,
messages=[{"role": "user", "content": "hi"}],
model="model",
temperature=0.0,
max_tokens=None,
)
_, kwargs = mock_openai.call_args
assert kwargs["base_url"].endswith("/v1")
@pytest.mark.asyncio
async def test_does_not_double_v1(self, tmp_path):
"""_call_vllm does not append /v1 if base_url already ends with it."""
router = self._make_router(tmp_path)
provider = self._make_provider(base_url="http://localhost:8001/v1")
mock_choice = MagicMock()
mock_choice.message.content = "ok"
mock_response = MagicMock()
mock_response.choices = [mock_choice]
mock_response.model = "model"
mock_client = AsyncMock()
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
with patch("openai.AsyncOpenAI", return_value=mock_client) as mock_openai:
await router._call_vllm(
provider=provider,
messages=[{"role": "user", "content": "hi"}],
model="model",
temperature=0.0,
max_tokens=None,
)
_, kwargs = mock_openai.call_args
assert kwargs["base_url"] == "http://localhost:8001/v1"
@pytest.mark.asyncio
async def test_max_tokens_passed_when_set(self, tmp_path):
"""max_tokens is forwarded to the API when provided."""
router = self._make_router(tmp_path)
provider = self._make_provider()
mock_choice = MagicMock()
mock_choice.message.content = "ok"
mock_response = MagicMock()
mock_response.choices = [mock_choice]
mock_response.model = "model"
mock_client = AsyncMock()
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
with patch("openai.AsyncOpenAI", return_value=mock_client):
await router._call_vllm(
provider=provider,
messages=[{"role": "user", "content": "hi"}],
model="model",
temperature=0.0,
max_tokens=256,
)
call_kwargs = mock_client.chat.completions.create.call_args[1]
assert call_kwargs.get("max_tokens") == 256
@pytest.mark.asyncio
async def test_max_tokens_omitted_when_none(self, tmp_path):
"""max_tokens key is absent when not provided."""
router = self._make_router(tmp_path)
provider = self._make_provider()
mock_choice = MagicMock()
mock_choice.message.content = "ok"
mock_response = MagicMock()
mock_response.choices = [mock_choice]
mock_response.model = "model"
mock_client = AsyncMock()
mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
with patch("openai.AsyncOpenAI", return_value=mock_client):
await router._call_vllm(
provider=provider,
messages=[{"role": "user", "content": "hi"}],
model="model",
temperature=0.0,
max_tokens=None,
)
call_kwargs = mock_client.chat.completions.create.call_args[1]
assert "max_tokens" not in call_kwargs
# ── providers.yaml loads vllm-local ─────────────────────────────────────────
@pytest.mark.unit
class TestProvidersYamlVllm:
"""Verify providers.yaml contains a valid vllm-local entry."""
def test_vllm_local_entry_exists(self):
"""providers.yaml has a vllm-local provider of type vllm."""
config_path = Path(__file__).parents[2] / "config" / "providers.yaml"
assert config_path.exists(), "config/providers.yaml not found"
with config_path.open() as f:
config = yaml.safe_load(f)
providers = config.get("providers", [])
vllm_providers = [p for p in providers if p.get("type") == "vllm"]
assert vllm_providers, "No provider with type=vllm found in providers.yaml"
vllm_local = next((p for p in vllm_providers if p["name"] == "vllm-local"), None)
assert vllm_local is not None, "vllm-local provider not found in providers.yaml"
def test_vllm_local_disabled_by_default(self):
"""vllm-local is disabled by default so the router stays on Ollama."""
config_path = Path(__file__).parents[2] / "config" / "providers.yaml"
with config_path.open() as f:
config = yaml.safe_load(f)
providers = config.get("providers", [])
vllm_local = next((p for p in providers if p.get("name") == "vllm-local"), None)
assert vllm_local is not None
assert vllm_local.get("enabled") is False, "vllm-local should be disabled by default"
def test_vllm_local_has_default_model(self):
"""vllm-local has at least one model with a context window."""
config_path = Path(__file__).parents[2] / "config" / "providers.yaml"
with config_path.open() as f:
config = yaml.safe_load(f)
providers = config.get("providers", [])
vllm_local = next((p for p in providers if p.get("name") == "vllm-local"), None)
assert vllm_local is not None
models = vllm_local.get("models", [])
assert models, "vllm-local must declare at least one model"
default_models = [m for m in models if m.get("default")]
assert default_models, "vllm-local must have a model marked default: true"
# ── config.py backend option ─────────────────────────────────────────────────
@pytest.mark.unit
class TestConfigVllmBackend:
"""Verify config.py exposes the vllm backend option."""
def test_vllm_is_valid_backend(self):
"""timmy_model_backend accepts 'vllm' without validation errors."""
from config import Settings
s = Settings(timmy_model_backend="vllm")
assert s.timmy_model_backend == "vllm"
def test_vllm_url_default(self):
"""vllm_url has a sensible default."""
from config import Settings
s = Settings()
assert s.vllm_url.startswith("http://")
def test_vllm_model_default(self):
"""vllm_model has a sensible default."""
from config import Settings
s = Settings()
assert s.vllm_model # non-empty string
# ── Health check helpers ─────────────────────────────────────────────────────
@pytest.mark.unit
class TestVllmHealthCheck:
"""Test _check_vllm_sync and _check_vllm."""
def test_sync_returns_healthy_on_200(self):
"""_check_vllm_sync returns 'healthy' when server responds 200."""
import urllib.request
from dashboard.routes.health import _check_vllm_sync
mock_response = MagicMock()
mock_response.status = 200
mock_response.__enter__ = lambda s: s
mock_response.__exit__ = MagicMock(return_value=False)
with patch.object(urllib.request, "urlopen", return_value=mock_response):
result = _check_vllm_sync()
assert result.status == "healthy"
assert result.name == "vLLM"
def test_sync_returns_unavailable_on_connection_error(self):
"""_check_vllm_sync returns 'unavailable' when server is unreachable."""
import urllib.error
import urllib.request
from dashboard.routes.health import _check_vllm_sync
with patch.object(urllib.request, "urlopen", side_effect=urllib.error.URLError("refused")):
result = _check_vllm_sync()
assert result.status == "unavailable"
assert result.name == "vLLM"
@pytest.mark.asyncio
async def test_async_caches_result(self):
"""_check_vllm caches the result for _VLLM_CACHE_TTL seconds."""
import dashboard.routes.health as health_module
from dashboard.routes.health import _check_vllm
# Reset cache
health_module._vllm_cache = None
health_module._vllm_cache_ts = 0.0
mock_dep = MagicMock()
mock_dep.status = "healthy"
with patch("dashboard.routes.health._check_vllm_sync", return_value=mock_dep):
result1 = await _check_vllm()
result2 = await _check_vllm() # should hit cache
assert result1 is result2 # same object returned from cache
@pytest.mark.asyncio
async def test_async_refreshes_after_ttl(self):
"""_check_vllm refreshes the cache after the TTL expires."""
import dashboard.routes.health as health_module
from dashboard.routes.health import _VLLM_CACHE_TTL, _check_vllm
# Expire the cache
health_module._vllm_cache = None
health_module._vllm_cache_ts = time.monotonic() - _VLLM_CACHE_TTL - 1
mock_dep = MagicMock()
mock_dep.status = "unavailable"
with patch("dashboard.routes.health._check_vllm_sync", return_value=mock_dep) as mock_fn:
await _check_vllm()
mock_fn.assert_called_once()

View File

@@ -0,0 +1,270 @@
"""Tests for Daily Run orchestrator — health snapshot integration.
Verifies that the orchestrator runs a pre-flight health snapshot before
any coding work begins, and aborts on red status unless --force is passed.
Refs: #923
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
# Add timmy_automations to path for imports
_TA_PATH = Path(__file__).resolve().parent.parent.parent / "timmy_automations" / "daily_run"
if str(_TA_PATH) not in sys.path:
sys.path.insert(0, str(_TA_PATH))
# Also add utils path
_TA_UTILS = Path(__file__).resolve().parent.parent.parent / "timmy_automations"
if str(_TA_UTILS) not in sys.path:
sys.path.insert(0, str(_TA_UTILS))
import health_snapshot as hs
import orchestrator as orch
def _make_snapshot(overall_status: str) -> hs.HealthSnapshot:
"""Build a minimal HealthSnapshot for testing."""
return hs.HealthSnapshot(
timestamp="2026-01-01T00:00:00+00:00",
overall_status=overall_status,
ci=hs.CISignal(status="pass", message="CI passing"),
issues=hs.IssueSignal(count=0, p0_count=0, p1_count=0),
flakiness=hs.FlakinessSignal(
status="healthy",
recent_failures=0,
recent_cycles=10,
failure_rate=0.0,
message="All good",
),
tokens=hs.TokenEconomySignal(status="balanced", message="Balanced"),
)
def _make_red_snapshot() -> hs.HealthSnapshot:
return hs.HealthSnapshot(
timestamp="2026-01-01T00:00:00+00:00",
overall_status="red",
ci=hs.CISignal(status="fail", message="CI failed"),
issues=hs.IssueSignal(count=1, p0_count=1, p1_count=0),
flakiness=hs.FlakinessSignal(
status="critical",
recent_failures=8,
recent_cycles=10,
failure_rate=0.8,
message="High flakiness",
),
tokens=hs.TokenEconomySignal(status="unknown", message="No data"),
)
def _default_args(**overrides) -> argparse.Namespace:
"""Build an argparse Namespace with defaults matching the orchestrator flags."""
defaults = {
"review": False,
"json": False,
"max_items": None,
"skip_health_check": False,
"force": False,
}
defaults.update(overrides)
return argparse.Namespace(**defaults)
class TestRunHealthSnapshot:
"""Test run_health_snapshot() — the pre-flight check called by main()."""
def test_green_returns_zero(self, capsys):
"""Green snapshot returns 0 (proceed)."""
args = _default_args()
with patch.object(orch, "_generate_health_snapshot", return_value=_make_snapshot("green")):
rc = orch.run_health_snapshot(args)
assert rc == 0
def test_yellow_returns_zero(self, capsys):
"""Yellow snapshot returns 0 (proceed with caution)."""
args = _default_args()
with patch.object(orch, "_generate_health_snapshot", return_value=_make_snapshot("yellow")):
rc = orch.run_health_snapshot(args)
assert rc == 0
def test_red_returns_one(self, capsys):
"""Red snapshot returns 1 (abort)."""
args = _default_args()
with patch.object(orch, "_generate_health_snapshot", return_value=_make_red_snapshot()):
rc = orch.run_health_snapshot(args)
assert rc == 1
def test_red_with_force_returns_zero(self, capsys):
"""Red snapshot with --force returns 0 (proceed anyway)."""
args = _default_args(force=True)
with patch.object(orch, "_generate_health_snapshot", return_value=_make_red_snapshot()):
rc = orch.run_health_snapshot(args)
assert rc == 0
def test_snapshot_exception_is_skipped(self, capsys):
"""If health snapshot raises, it degrades gracefully and returns 0."""
args = _default_args()
with patch.object(orch, "_generate_health_snapshot", side_effect=RuntimeError("boom")):
rc = orch.run_health_snapshot(args)
assert rc == 0
captured = capsys.readouterr()
assert "warning" in captured.err.lower() or "skipping" in captured.err.lower()
def test_snapshot_prints_summary(self, capsys):
"""Health snapshot prints a pre-flight summary block."""
args = _default_args()
with patch.object(orch, "_generate_health_snapshot", return_value=_make_snapshot("green")):
orch.run_health_snapshot(args)
captured = capsys.readouterr()
assert "PRE-FLIGHT HEALTH CHECK" in captured.out
assert "CI" in captured.out
def test_red_prints_abort_message(self, capsys):
"""Red snapshot prints an abort message to stderr."""
args = _default_args()
with patch.object(orch, "_generate_health_snapshot", return_value=_make_red_snapshot()):
orch.run_health_snapshot(args)
captured = capsys.readouterr()
assert "RED" in captured.err or "aborting" in captured.err.lower()
def test_p0_issues_shown_in_output(self, capsys):
"""P0 issue count is shown in the pre-flight output."""
args = _default_args()
snapshot = hs.HealthSnapshot(
timestamp="2026-01-01T00:00:00+00:00",
overall_status="red",
ci=hs.CISignal(status="pass", message="CI passing"),
issues=hs.IssueSignal(count=2, p0_count=2, p1_count=0),
flakiness=hs.FlakinessSignal(
status="healthy",
recent_failures=0,
recent_cycles=10,
failure_rate=0.0,
message="All good",
),
tokens=hs.TokenEconomySignal(status="balanced", message="Balanced"),
)
with patch.object(orch, "_generate_health_snapshot", return_value=snapshot):
orch.run_health_snapshot(args)
captured = capsys.readouterr()
assert "P0" in captured.out
class TestMainHealthCheckIntegration:
"""Test that main() runs health snapshot before any coding work."""
def _patch_gitea_unavailable(self):
return patch.object(orch.GiteaClient, "is_available", return_value=False)
def test_main_runs_health_check_before_gitea(self):
"""Health snapshot is called before Gitea client work."""
call_order = []
def fake_snapshot(*_a, **_kw):
call_order.append("health")
return _make_snapshot("green")
def fake_gitea_available(self):
call_order.append("gitea")
return False
args = _default_args()
with (
patch.object(orch, "_generate_health_snapshot", side_effect=fake_snapshot),
patch.object(orch.GiteaClient, "is_available", fake_gitea_available),
patch("sys.argv", ["orchestrator"]),
):
orch.main()
assert call_order.index("health") < call_order.index("gitea")
def test_main_aborts_on_red_before_gitea(self):
"""main() aborts with non-zero exit code when health is red."""
gitea_called = []
def fake_gitea_available(self):
gitea_called.append(True)
return True
with (
patch.object(orch, "_generate_health_snapshot", return_value=_make_red_snapshot()),
patch.object(orch.GiteaClient, "is_available", fake_gitea_available),
patch("sys.argv", ["orchestrator"]),
):
rc = orch.main()
assert rc != 0
assert not gitea_called, "Gitea should NOT be called when health is red"
def test_main_skips_health_check_with_flag(self):
"""--skip-health-check bypasses the pre-flight snapshot."""
health_called = []
def fake_snapshot(*_a, **_kw):
health_called.append(True)
return _make_snapshot("green")
with (
patch.object(orch, "_generate_health_snapshot", side_effect=fake_snapshot),
patch.object(orch.GiteaClient, "is_available", return_value=False),
patch("sys.argv", ["orchestrator", "--skip-health-check"]),
):
orch.main()
assert not health_called, "Health snapshot should be skipped"
def test_main_force_flag_continues_despite_red(self):
"""--force allows Daily Run to continue even when health is red."""
gitea_called = []
def fake_gitea_available(self):
gitea_called.append(True)
return False # Gitea unavailable → exits early but after health check
with (
patch.object(orch, "_generate_health_snapshot", return_value=_make_red_snapshot()),
patch.object(orch.GiteaClient, "is_available", fake_gitea_available),
patch("sys.argv", ["orchestrator", "--force"]),
):
orch.main()
# Gitea was reached despite red status because --force was passed
assert gitea_called
def test_main_json_output_on_red_includes_error(self, capsys):
"""JSON output includes error key when health is red."""
with (
patch.object(orch, "_generate_health_snapshot", return_value=_make_red_snapshot()),
patch.object(orch.GiteaClient, "is_available", return_value=True),
patch("sys.argv", ["orchestrator", "--json"]),
):
rc = orch.main()
assert rc != 0
captured = capsys.readouterr()
data = json.loads(captured.out)
assert "error" in data

View File

@@ -4,10 +4,13 @@
Connects to local Gitea, fetches candidate issues, and produces a concise agenda
plus a day summary (review mode).
The Daily Run begins with a Quick Health Snapshot (#710) to ensure mandatory
systems are green before burning cycles on work that cannot land.
Run: python3 timmy_automations/daily_run/orchestrator.py [--review]
Env: See timmy_automations/config/daily_run.json for configuration
Refs: #703
Refs: #703, #923
"""
from __future__ import annotations
@@ -30,6 +33,11 @@ sys.path.insert(
)
from utils.token_rules import TokenRules, compute_token_reward
# Health snapshot lives in the same package
from health_snapshot import generate_snapshot as _generate_health_snapshot
from health_snapshot import get_token as _hs_get_token
from health_snapshot import load_config as _hs_load_config
# ── Configuration ─────────────────────────────────────────────────────────
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
@@ -495,6 +503,16 @@ def parse_args() -> argparse.Namespace:
default=None,
help="Override max agenda items",
)
p.add_argument(
"--skip-health-check",
action="store_true",
help="Skip the pre-flight health snapshot (not recommended)",
)
p.add_argument(
"--force",
action="store_true",
help="Continue even if health snapshot is red (overrides abort-on-red)",
)
return p.parse_args()
@@ -535,6 +553,76 @@ def compute_daily_run_tokens(success: bool = True) -> dict[str, Any]:
}
def run_health_snapshot(args: argparse.Namespace) -> int:
"""Run pre-flight health snapshot and return 0 (ok) or 1 (abort).
Prints a concise summary of CI, issues, flakiness, and token economy.
Returns 1 if the overall status is red AND --force was not passed.
Returns 0 for green/yellow or when --force is active.
On any import/runtime error the check is skipped with a warning.
"""
try:
hs_config = _hs_load_config()
hs_token = _hs_get_token(hs_config)
snapshot = _generate_health_snapshot(hs_config, hs_token)
except Exception as exc: # noqa: BLE001
print(f"[health] Warning: health snapshot failed ({exc}) — skipping", file=sys.stderr)
return 0
# Print concise pre-flight header
status_emoji = {"green": "🟢", "yellow": "🟡", "red": "🔴"}.get(
snapshot.overall_status, ""
)
print("─" * 60)
print(f"PRE-FLIGHT HEALTH CHECK {status_emoji} {snapshot.overall_status.upper()}")
print("─" * 60)
ci_emoji = {"pass": "✅", "fail": "❌", "unknown": "⚠️", "unavailable": ""}.get(
snapshot.ci.status, ""
)
print(f" {ci_emoji} CI: {snapshot.ci.message}")
if snapshot.issues.p0_count > 0:
issue_emoji = "🔴"
elif snapshot.issues.p1_count > 0:
issue_emoji = "🟡"
else:
issue_emoji = "✅"
critical_str = f"{snapshot.issues.count} critical"
if snapshot.issues.p0_count:
critical_str += f" (P0: {snapshot.issues.p0_count})"
if snapshot.issues.p1_count:
critical_str += f" (P1: {snapshot.issues.p1_count})"
print(f" {issue_emoji} Issues: {critical_str}")
flak_emoji = {"healthy": "✅", "degraded": "🟡", "critical": "🔴", "unknown": ""}.get(
snapshot.flakiness.status, ""
)
print(f" {flak_emoji} Flakiness: {snapshot.flakiness.message}")
token_emoji = {"balanced": "✅", "inflationary": "🟡", "deflationary": "🔵", "unknown": ""}.get(
snapshot.tokens.status, ""
)
print(f" {token_emoji} Tokens: {snapshot.tokens.message}")
print()
if snapshot.overall_status == "red" and not args.force:
print(
"🛑 Health status is RED — aborting Daily Run to avoid burning cycles.",
file=sys.stderr,
)
print(
" Fix the issues above or re-run with --force to override.",
file=sys.stderr,
)
return 1
if snapshot.overall_status == "red":
print("⚠️ Health is RED but --force passed — proceeding anyway.", file=sys.stderr)
return 0
def main() -> int:
args = parse_args()
config = load_config()
@@ -542,6 +630,15 @@ def main() -> int:
if args.max_items:
config["max_agenda_items"] = args.max_items
# ── Step 0: Pre-flight health snapshot ──────────────────────────────────
if not args.skip_health_check:
health_rc = run_health_snapshot(args)
if health_rc != 0:
tokens = compute_daily_run_tokens(success=False)
if args.json:
print(json.dumps({"error": "health_check_failed", "tokens": tokens}))
return health_rc
token = get_token(config)
client = GiteaClient(config, token)