---
# Cascade LLM Router Configuration
# Providers are tried in priority order (1 = highest)
# On failure, automatically falls back to next provider
# vLLM backend (priority 3) added alongside Ollama and vllm-mlx — see issue #1281

cascade:
  # Timeout settings
  timeout_seconds: 30

  # Retry settings
  max_retries_per_provider: 2
  retry_delay_seconds: 1

  # Circuit breaker settings
  circuit_breaker:
    failure_threshold: 5  # Open circuit after 5 failures
    recovery_timeout: 60  # Try again after 60 seconds
    half_open_max_calls: 2  # Allow 2 test calls when half-open

providers:
  # Primary: Local Ollama (always try first for sovereignty)
  - name: ollama-local
    type: ollama
    enabled: true
    priority: 1
    tier: local
    url: "http://localhost:11434"
    models:
      # ── Dual-model routing: Qwen3-8B (fast) + Qwen3-14B (quality) ──────────
      # Both models fit simultaneously: ~6.6 GB + ~10.5 GB = ~17 GB combined.
      # Requires OLLAMA_MAX_LOADED_MODELS=2 (set in .env) to stay hot.
      # Ref: issue #1065 — Qwen3-8B/14B dual-model routing strategy
      - name: qwen3:8b
        context_window: 32768
        capabilities: [text, tools, json, streaming, routine]
        description: "Qwen3-8B Q6_K — fast router for routine tasks (~6.6 GB, 45-55 tok/s)"
      - name: qwen3:14b
        context_window: 40960
        capabilities: [text, tools, json, streaming, complex, reasoning]
        description: "Qwen3-14B Q5_K_M — complex reasoning and planning (~10.5 GB, 20-28 tok/s)"

      # Text + Tools models
      - name: qwen3:30b
        default: true
        context_window: 128000
        # Note: actual context is capped by OLLAMA_NUM_CTX (default 4096) to save RAM
        capabilities: [text, tools, json, streaming]
      - name: llama3.1:8b-instruct
        context_window: 128000
        capabilities: [text, tools, json, streaming]
      - name: llama3.2:3b
        context_window: 128000
        capabilities: [text, tools, json, streaming, vision]
      - name: qwen2.5:14b
        context_window: 32000
        capabilities: [text, tools, json, streaming]
      - name: deepseek-r1:1.5b
        context_window: 32000
        capabilities: [text, json, streaming]

      # Vision models
      - name: llava:7b
        context_window: 4096
        capabilities: [text, vision, streaming]
      - name: qwen2.5-vl:3b
        context_window: 32000
        capabilities: [text, vision, tools, json, streaming]
      - name: moondream:1.8b
        context_window: 2048
        capabilities: [text, vision, streaming]

      # AutoLoRA base: Hermes 4 14B — native tool calling, hybrid reasoning, structured JSON
      # Import via: ollama create hermes4-14b -f Modelfile.hermes4-14b
      # See Modelfile.hermes4-14b for GGUF download instructions (Project Bannerlord #1101)
      - name: hermes4-14b
        context_window: 32768
        capabilities: [text, tools, json, streaming, reasoning]
        description: "NousResearch Hermes 4 14B — AutoLoRA base (Q5_K_M, ~11 GB)"

      # AutoLoRA fine-tuned: Timmy — Hermes 4 14B + Timmy LoRA adapter (Project Bannerlord #1104)
      # Build via: ./scripts/fuse_and_load.sh (fuses adapter, converts to GGUF, imports)
      # Then switch harness: hermes model timmy
      # Validate: python scripts/test_timmy_skills.py
      - name: timmy
        context_window: 32768
        capabilities: [text, tools, json, streaming, reasoning]
        description: "Timmy — Hermes 4 14B fine-tuned on Timmy skill set (LoRA-fused, Q5_K_M, ~11 GB)"

      # AutoLoRA stretch goal: Hermes 4.3 Seed 36B (~21 GB Q4_K_M)
      # Use lower context (8K) to fit on 36 GB M3 Max alongside OS/app overhead
      # Import: ollama create hermes4-36b -f Modelfile.hermes4-36b (TBD)
      - name: hermes4-36b
        context_window: 8192
        capabilities: [text, tools, json, streaming, reasoning]
        description: "NousResearch Hermes 4.3 Seed 36B — stretch goal (Q4_K_M, ~21 GB)"

      # Creative writing fallback (Dolphin 3.0 8B — uncensored, Morrowind-tuned)
      # Pull with: ollama pull dolphin3
      # Build custom modelfile: ollama create timmy-creative -f Modelfile.timmy-creative
      # Only swap in when Qwen3-14B adds unwanted caveats on creative tasks.
      # Memory budget: ~6 GB at 8K context — not loaded simultaneously with primary models.
      - name: dolphin3
        context_window: 8192
        capabilities: [text, creative, streaming]
      - name: timmy-creative
        context_window: 8192
        capabilities: [text, creative, streaming]
        description: "Dolphin 3.0 8B with Morrowind system prompt and higher temperature"

  # Secondary: vllm-mlx (OpenAI-compatible local backend, 25–50% faster than Ollama on Apple Silicon)
  # Evaluation results (EuroMLSys '26 / M3 Ultra benchmarks):
  #   - 21–87% higher throughput than llama.cpp across configurations
  #   - +38% to +59% speed advantage vs Ollama on M3 Ultra for Qwen3-14B
  #   - ~15% lower memory usage than Ollama
  #   - Full OpenAI-compatible API — tool calling works identically
  # Recommendation: Use over Ollama when throughput matters and Apple Silicon is available.
  # Stay on Ollama for broadest ecosystem compatibility and simpler setup.
  # To enable: start vllm-mlx server (`python -m vllm.entrypoints.openai.api_server
  # --model Qwen/Qwen2.5-14B-Instruct-MLX --port 8000`) then set enabled: true.
  - name: vllm-mlx-local
    type: vllm_mlx
    enabled: false  # Enable when vllm-mlx server is running
    priority: 2
    tier: local
    base_url: "http://localhost:8000/v1"
    models:
      - name: Qwen/Qwen2.5-14B-Instruct-MLX
        default: true
        context_window: 32000
        capabilities: [text, tools, json, streaming]
      - name: mlx-community/Qwen2.5-7B-Instruct-4bit
        context_window: 32000
        capabilities: [text, tools, json, streaming]

  # Tertiary: vLLM (OpenAI-compatible, continuous batching, 3-10x agentic throughput)
  # Runs on CUDA GPU or CPU. On Apple Silicon, prefer vllm-mlx-local (above).
  # To enable: start vLLM server:
  #   python -m vllm.entrypoints.openai.api_server \
  #     --model Qwen/Qwen2.5-14B-Instruct --port 8001
  # Then set enabled: true (or TIMMY_LLM_BACKEND=vllm + VLLM_URL=http://localhost:8001)
  - name: vllm-local
    type: vllm
    enabled: false  # Enable when vLLM server is running
    priority: 3
    tier: local
    base_url: "http://localhost:8001/v1"
    models:
      - name: Qwen/Qwen2.5-14B-Instruct
        default: true
        context_window: 32000
        capabilities: [text, tools, json, streaming, complex]
        description: "Qwen2.5-14B on vLLM — continuous batching for agentic workloads"
      - name: Qwen/Qwen2.5-7B-Instruct
        context_window: 32000
        capabilities: [text, tools, json, streaming, routine]
        description: "Qwen2.5-7B on vLLM — fast model for routine tasks"

  # Quaternary: OpenAI (if API key available)
  - name: openai-backup
    type: openai
    enabled: false  # Enable by setting OPENAI_API_KEY
    priority: 4
    tier: standard_cloud
    api_key: "${OPENAI_API_KEY}"  # Loaded from environment
    base_url: null  # Use default OpenAI endpoint
    models:
      - name: gpt-4o-mini
        default: true
        context_window: 128000
        capabilities: [text, vision, tools, json, streaming]
      - name: gpt-4o
        context_window: 128000
        capabilities: [text, vision, tools, json, streaming]

  # Quinary: Anthropic (if API key available)
  - name: anthropic-backup
    type: anthropic
    enabled: false  # Enable by setting ANTHROPIC_API_KEY
    priority: 5
    tier: frontier
    api_key: "${ANTHROPIC_API_KEY}"
    models:
      - name: claude-3-haiku-20240307
        default: true
        context_window: 200000
        capabilities: [text, vision, streaming]
      - name: claude-3-sonnet-20240229
        context_window: 200000
        capabilities: [text, vision, tools, streaming]

# ── Capability-Based Fallback Chains ────────────────────────────────────────
# When a model doesn't support a required capability (e.g., vision),
# the system falls back through these chains in order.

fallback_chains:
  # Vision-capable models (for image understanding)
  vision:
    - llama3.2:3b  # Fast, good vision
    - qwen2.5-vl:3b  # Excellent vision, small
    - llava:7b  # Classic vision model
    - moondream:1.8b  # Tiny, fast vision

  # Tool-calling models (for function calling)
  tools:
    - timmy  # Fine-tuned Timmy (Hermes 4 14B + LoRA) — primary agent model
    - hermes4-14b  # Native tool calling + structured JSON (AutoLoRA base)
    - llama3.1:8b-instruct  # Reliable tool use
    # NOTE(review): qwen2.5:7b is not declared in the ollama-local models list
    # (only qwen2.5:14b is) — confirm this entry or switch it to qwen2.5:14b.
    - qwen2.5:7b  # Reliable tools
    - llama3.2:3b  # Small but capable

  # General text generation (any model)
  text:
    - qwen3:30b
    - llama3.1:8b-instruct
    - qwen2.5:14b
    - deepseek-r1:1.5b
    - llama3.2:3b

  # Creative writing fallback chain
  # Ordered preference: Morrowind-tuned Dolphin → base Dolphin 3 → Qwen3 (primary)
  # Invoke when Qwen3-14B adds unwanted caveats on journal/lore/NPC tasks.
  creative:
    - timmy-creative  # dolphin3 + Morrowind system prompt (Modelfile.timmy-creative)
    - dolphin3  # base Dolphin 3.0 8B (uncensored, no custom system prompt)
    - qwen3:30b  # primary fallback — usually sufficient with a good system prompt

  # ── Complexity-based routing chains (issue #1065) ───────────────────────
  # Routine tasks: prefer Qwen3-8B for low latency (~45-55 tok/s)
  routine:
    - qwen3:8b  # Primary fast model
    - llama3.1:8b-instruct  # Fallback fast model
    - llama3.2:3b  # Smallest available

  # Complex tasks: prefer Qwen3-14B for quality (~20-28 tok/s)
  complex:
    - qwen3:14b  # Primary quality model
    - hermes4-14b  # Native tool calling, hybrid reasoning
    - qwen3:30b  # Highest local quality
    - qwen2.5:14b  # Additional fallback

# ── Custom Models ───────────────────────────────────────────────────────────
# Register custom model weights for per-agent assignment.
# Supports GGUF (Ollama), safetensors, and HuggingFace checkpoint dirs.
# Models can also be registered at runtime via the /api/v1/models API.
#
# Roles: general (default inference), reward (PRM scoring),
#        teacher (distillation), judge (output evaluation)
custom_models: []
# Example entries:
#   - name: my-finetuned-llama
#     format: gguf
#     path: /path/to/model.gguf
#     role: general
#     context_window: 8192
#     description: "Fine-tuned Llama for code tasks"
#
#   - name: reward-model
#     format: ollama
#     path: deepseek-r1:1.5b
#     role: reward
#     context_window: 32000
#     description: "Process reward model for scoring outputs"

# ── Agent Model Assignments ─────────────────────────────────────────────────
# Map persona agent IDs to specific models.
# Agents without an assignment use the global default (ollama_model).
agent_model_assignments: {}
# Example:
#   persona-forge: my-finetuned-llama
#   persona-echo: deepseek-r1:1.5b

# ── Multi-Modal Settings ────────────────────────────────────────────────────
multimodal:
  # Automatically pull models when needed
  auto_pull: true

  # Timeout for model pulling (seconds)
  pull_timeout: 300

  # Maximum fallback depth (how many models to try before giving up)
  max_fallback_depth: 3

  # Prefer smaller models for vision when available (faster)
  prefer_small_vision: true

# Cost tracking (optional, for budget monitoring)
cost_tracking:
  enabled: true
  budget_daily_usd: 10.0  # Alert if daily spend exceeds this
  alert_threshold_percent: 80  # Alert at 80% of budget

# Metrics retention
metrics:
  retention_hours: 168  # Keep 7 days of metrics
  purge_interval_hours: 24