# Cascade LLM Router Configuration
# Providers are tried in priority order (1 = highest).
# On failure, the router automatically falls back to the next provider.

cascade:
  # Timeout settings
  timeout_seconds: 30

  # Retry settings
  max_retries_per_provider: 2
  retry_delay_seconds: 1

  # Circuit breaker settings
  circuit_breaker:
    failure_threshold: 5  # Open circuit after 5 failures
    recovery_timeout: 60  # Try again after 60 seconds
    half_open_max_calls: 2  # Allow 2 test calls when half-open

providers:
  # Primary: Local Ollama (always try first for sovereignty)
  - name: ollama-local
    type: ollama
    enabled: true
    priority: 1
    url: "http://localhost:11434"
    models:
      # Text + Tools models
      - name: llama3.1:8b-instruct
        default: true
        context_window: 128000
        capabilities: [text, tools, json, streaming]
      # NOTE(review): this entry advertises `vision`; Ollama's vision
      # variants appear to be published as `llama3.2-vision` -- confirm that
      # the 3b tag really accepts images before relying on it.
      - name: llama3.2:3b
        context_window: 128000
        capabilities: [text, tools, json, streaming, vision]
      - name: qwen2.5:14b
        context_window: 32000
        capabilities: [text, tools, json, streaming]
      - name: deepseek-r1:1.5b
        context_window: 32000
        capabilities: [text, json, streaming]

      # Vision models
      - name: llava:7b
        context_window: 4096
        capabilities: [text, vision, streaming]
      - name: qwen2.5-vl:3b
        context_window: 32000
        capabilities: [text, vision, tools, json, streaming]
      - name: moondream:1.8b
        context_window: 2048
        capabilities: [text, vision, streaming]

  # Secondary: Local AirLLM (if installed)
  - name: airllm-local
    type: airllm
    enabled: false  # Enable if pip install airllm
    priority: 2
    models:
      - name: 70b
        default: true
        capabilities: [text, tools, json, streaming]
      - name: 8b
        capabilities: [text, tools, json, streaming]
      - name: 405b
        capabilities: [text, tools, json, streaming]

  # Tertiary: OpenAI (if API key available)
  - name: openai-backup
    type: openai
    enabled: false  # Enable by setting OPENAI_API_KEY
    priority: 3
    api_key: "${OPENAI_API_KEY}"  # Loaded from environment
    base_url: null  # Use default OpenAI endpoint
    models:
      - name: gpt-4o-mini
        default: true
        context_window: 128000
        capabilities: [text, vision, tools, json, streaming]
      - name: gpt-4o
        context_window: 128000
        capabilities: [text, vision, tools, json, streaming]

  # Quaternary: Anthropic (if API key available)
  - name: anthropic-backup
    type: anthropic
    enabled: false  # Enable by setting ANTHROPIC_API_KEY
    priority: 4
    api_key: "${ANTHROPIC_API_KEY}"
    models:
      - name: claude-3-haiku-20240307
        default: true
        context_window: 200000
        capabilities: [text, vision, streaming]
      - name: claude-3-sonnet-20240229
        context_window: 200000
        capabilities: [text, vision, tools, streaming]

# ── Capability-Based Fallback Chains ────────────────────────────────────────
# When a model doesn't support a required capability (e.g., vision),
# the system falls back through these chains in order.
# Every entry should match a model name declared under `providers` above.

fallback_chains:
  # Vision-capable models (for image understanding)
  vision:
    - llama3.2:3b  # Fast, good vision
    - qwen2.5-vl:3b  # Excellent vision, small
    - llava:7b  # Classic vision model
    - moondream:1.8b  # Tiny, fast vision

  # Tool-calling models (for function calling)
  tools:
    - llama3.1:8b-instruct  # Best tool use
    # Was `qwen2.5:7b`, which no provider declares; 14b is the qwen2.5
    # text/tools model registered under ollama-local.
    - qwen2.5:14b  # Reliable tools
    - llama3.2:3b  # Small but capable

  # General text generation (any model)
  text:
    - llama3.1:8b-instruct
    - qwen2.5:14b
    - deepseek-r1:1.5b
    - llama3.2:3b

# ── Custom Models ───────────────────────────────────────────────────────────
# Register custom model weights for per-agent assignment.
# Supports GGUF (Ollama), safetensors, and HuggingFace checkpoint dirs.
# Models can also be registered at runtime via the /api/v1/models API.
#
# Roles: general (default inference), reward (PRM scoring),
#        teacher (distillation), judge (output evaluation)

custom_models: []
# Example entries:
# - name: my-finetuned-llama
#   format: gguf
#   path: /path/to/model.gguf
#   role: general
#   context_window: 8192
#   description: "Fine-tuned Llama for code tasks"
#
# - name: reward-model
#   format: ollama
#   path: deepseek-r1:1.5b
#   role: reward
#   context_window: 32000
#   description: "Process reward model for scoring outputs"

# ── Agent Model Assignments ─────────────────────────────────────────────────
# Map persona agent IDs to specific models.
# Agents without an assignment use the global default (ollama_model).

agent_model_assignments: {}
# Example:
#   persona-forge: my-finetuned-llama
#   persona-echo: deepseek-r1:1.5b

# ── Multi-Modal Settings ────────────────────────────────────────────────────

multimodal:
  # Automatically pull models when needed
  auto_pull: true

  # Timeout for model pulling (seconds)
  pull_timeout: 300

  # Maximum fallback depth (how many models to try before giving up)
  max_fallback_depth: 3

  # Prefer smaller models for vision when available (faster)
  prefer_small_vision: true

# Cost tracking (optional, for budget monitoring)
cost_tracking:
  enabled: true
  budget_daily_usd: 10.0  # Alert if daily spend exceeds this
  alert_threshold_percent: 80  # Alert at 80% of budget

# Metrics retention
metrics:
  retention_hours: 168  # Keep 7 days of metrics
  purge_interval_hours: 24