# Cascade LLM Router Configuration
# Providers are tried in priority order (1 = highest)
# On failure, automatically falls back to next provider
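#
# Example flow (illustrative): a request goes to ollama-local (priority 1)
# first; if it keeps failing after max_retries_per_provider retries, the
# router falls through to the next enabled provider in priority order.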
cascade:
  # Timeout settings
  timeout_seconds: 30

  # Retry settings
  max_retries_per_provider: 2
  retry_delay_seconds: 1

  # Circuit breaker settings
  circuit_breaker:
    failure_threshold: 5     # Open circuit after 5 failures
    recovery_timeout: 60     # Try again after 60 seconds
    half_open_max_calls: 2   # Allow 2 test calls when half-open
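    # Lifecycle sketch (assuming a conventional closed/open/half-open breaker):
    # 5 consecutive failures open the circuit and the provider is skipped;
    # after 60 s the breaker goes half-open and admits 2 probe calls, whose
    # success closes it again (another failure re-opens it).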
providers:
  # Primary: Local Ollama (always try first for sovereignty)
  - name: ollama-local
    type: ollama
    enabled: true
    priority: 1
    url: "http://localhost:11434"
    models:
      # Text + Tools models
      - name: qwen3:30b
        default: true
        context_window: 128000
        # Note: actual context is capped by OLLAMA_NUM_CTX (default 4096) to save RAM
        capabilities: [text, tools, json, streaming]
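        # (Sketch, assuming the env var from the note above is honored at
        # server start, e.g. `OLLAMA_NUM_CTX=32768 ollama serve`; a higher
        # cap means a larger KV cache in RAM.)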
      - name: llama3.1:8b-instruct
        context_window: 128000
        capabilities: [text, tools, json, streaming]
      - name: llama3.2:3b
        context_window: 128000
        capabilities: [text, tools, json, streaming, vision]
      - name: qwen2.5:14b
        context_window: 32000
        capabilities: [text, tools, json, streaming]
      - name: deepseek-r1:1.5b
        context_window: 32000
        capabilities: [text, json, streaming]

      # Vision models
      - name: llava:7b
        context_window: 4096
        capabilities: [text, vision, streaming]
      - name: qwen2.5-vl:3b
        context_window: 32000
        capabilities: [text, vision, tools, json, streaming]
      - name: moondream:1.8b
        context_window: 2048
        capabilities: [text, vision, streaming]

  # Secondary: Local AirLLM (if installed)
  - name: airllm-local
    type: airllm
    enabled: false  # Enable after `pip install airllm`
    priority: 2
    models:
      - name: 70b
        default: true
        capabilities: [text, tools, json, streaming]
      - name: 8b
        capabilities: [text, tools, json, streaming]
      - name: 405b
        capabilities: [text, tools, json, streaming]

  # Tertiary: OpenAI (if API key available)
  - name: openai-backup
    type: openai
    enabled: false  # Enable by setting OPENAI_API_KEY
    priority: 3
    api_key: "${OPENAI_API_KEY}"  # Loaded from environment
    base_url: null  # Use default OpenAI endpoint
    models:
      - name: gpt-4o-mini
        default: true
        context_window: 128000
        capabilities: [text, vision, tools, json, streaming]
      - name: gpt-4o
        context_window: 128000
        capabilities: [text, vision, tools, json, streaming]

  # Quaternary: Anthropic (if API key available)
  - name: anthropic-backup
    type: anthropic
    enabled: false  # Enable by setting ANTHROPIC_API_KEY
    priority: 4
    api_key: "${ANTHROPIC_API_KEY}"
    models:
      - name: claude-3-haiku-20240307
        default: true
        context_window: 200000
        capabilities: [text, vision, streaming]
      - name: claude-3-sonnet-20240229
        context_window: 200000
        capabilities: [text, vision, tools, streaming]
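  # (Illustrative: a cloud backup stays inert until its key is present in the
  # environment, e.g. `export OPENAI_API_KEY=...` before startup, and
  # `enabled: true` is set; the "${VAR}" form is expanded from the environment.)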

# ── Capability-Based Fallback Chains ────────────────────────────────────────
# When a model doesn't support a required capability (e.g., vision),
# the system falls back through these chains in order (see the illustrative
# trace after the chains below).

fallback_chains:
  # Vision-capable models (for image understanding)
  vision:
    - llama3.2:3b      # Fast, good vision
    - qwen2.5-vl:3b    # Excellent vision, small
    - llava:7b         # Classic vision model
    - moondream:1.8b   # Tiny, fast vision

  # Tool-calling models (for function calling)
  tools:
    - llama3.1:8b-instruct  # Best tool use
    - qwen2.5:7b            # Reliable tools
    - llama3.2:3b           # Small but capable

  # General text generation (any model)
  text:
    - qwen3:30b
    - llama3.1:8b-instruct
    - qwen2.5:14b
    - deepseek-r1:1.5b
    - llama3.2:3b
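  # Illustrative trace: a vision request that lands on qwen2.5:14b (no vision
  # capability) is re-routed down the vision chain above, trying at most
  # multimodal.max_fallback_depth models before giving up.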

# ── Custom Models ───────────────────────────────────────────────────────────
# Register custom model weights for per-agent assignment.
# Supports GGUF (Ollama), safetensors, and HuggingFace checkpoint dirs.
# Models can also be registered at runtime via the /api/v1/models API.
#
# Roles: general (default inference), reward (PRM scoring),
#        teacher (distillation), judge (output evaluation)
custom_models: []
# Example entries:
# - name: my-finetuned-llama
#   format: gguf
#   path: /path/to/model.gguf
#   role: general
#   context_window: 8192
#   description: "Fine-tuned Llama for code tasks"
#
# - name: reward-model
#   format: ollama
#   path: deepseek-r1:1.5b
#   role: reward
#   context_window: 32000
#   description: "Process reward model for scoring outputs"
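#
# Runtime registration sketch (assumption: the /api/v1/models endpoint accepts
# the same fields as the entries above as JSON; host/port are deployment-specific):
#   curl -X POST http://<host>:<port>/api/v1/models \
#     -H 'Content-Type: application/json' \
#     -d '{"name": "my-finetuned-llama", "format": "gguf",
#          "path": "/path/to/model.gguf", "role": "general"}'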

# ── Agent Model Assignments ─────────────────────────────────────────────────
# Map persona agent IDs to specific models.
# Agents without an assignment use the global default (ollama_model).
agent_model_assignments: {}
# Example:
#   persona-forge: my-finetuned-llama
#   persona-echo: deepseek-r1:1.5b

# ── Multi-Modal Settings ────────────────────────────────────────────────────
multimodal:
  # Automatically pull models when needed
  auto_pull: true

  # Timeout for model pulling (seconds)
  pull_timeout: 300

  # Maximum fallback depth (how many models to try before giving up)
  max_fallback_depth: 3

  # Prefer smaller models for vision when available (faster)
  prefer_small_vision: true
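  # (Assumption about semantics, not stated elsewhere in this file: when set,
  # smaller vision models such as moondream:1.8b or qwen2.5-vl:3b are tried
  # ahead of larger ones like llava:7b.)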

# Cost tracking (optional, for budget monitoring)
cost_tracking:
  enabled: true
  budget_daily_usd: 10.0       # Alert if daily spend exceeds this
  alert_threshold_percent: 80  # Alert at 80% of budget
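  # (Worked example: with the values above, the alert fires once daily spend
  # reaches $8.00, i.e. 80% of the $10.00 budget.)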

# Metrics retention
metrics:
  retention_hours: 168      # Keep 7 days of metrics (7 × 24 h)
  purge_interval_hours: 24