---
# Cascade LLM Router Configuration
# Providers are tried in priority order (1 = highest)
# On failure, automatically falls back to next provider

cascade:
  # Timeout settings
  timeout_seconds: 30

  # Retry settings
  max_retries_per_provider: 2
  retry_delay_seconds: 1

  # Circuit breaker settings
  circuit_breaker:
    failure_threshold: 5    # Open circuit after 5 failures
    recovery_timeout: 60    # Try again after 60 seconds
    half_open_max_calls: 2  # Allow 2 test calls when half-open

providers:
  # Primary: Local Ollama (always try first for sovereignty)
  - name: ollama-local
    type: ollama
    enabled: true
    priority: 1
    url: "http://localhost:11434"
    models:
      # Text + Tools models
      # Model tags are quoted because they contain ':' — keeps generic YAML
      # tooling from ever misreading them.
      - name: "qwen3:30b"
        default: true
        # Note: actual context is capped by OLLAMA_NUM_CTX (default 4096) to save RAM
        context_window: 128000
        capabilities: [text, tools, json, streaming]
      - name: "llama3.1:8b-instruct"
        context_window: 128000
        capabilities: [text, tools, json, streaming]
      - name: "llama3.2:3b"
        context_window: 128000
        capabilities: [text, tools, json, streaming, vision]
      - name: "qwen2.5:14b"
        context_window: 32000
        capabilities: [text, tools, json, streaming]
      - name: "deepseek-r1:1.5b"
        context_window: 32000
        capabilities: [text, json, streaming]

      # Vision models
      - name: "llava:7b"
        context_window: 4096
        capabilities: [text, vision, streaming]
      - name: "qwen2.5-vl:3b"
        context_window: 32000
        capabilities: [text, vision, tools, json, streaming]
      - name: "moondream:1.8b"
        context_window: 2048
        capabilities: [text, vision, streaming]

      # AutoLoRA base: Hermes 4 14B — native tool calling, hybrid reasoning, structured JSON
      # Import via: ollama create hermes4-14b -f Modelfile.hermes4-14b
      # See Modelfile.hermes4-14b for GGUF download instructions (Project Bannerlord #1101)
      - name: hermes4-14b
        context_window: 32768
        capabilities: [text, tools, json, streaming, reasoning]
        description: "NousResearch Hermes 4 14B — AutoLoRA base (Q5_K_M, ~11 GB)"

      # AutoLoRA stretch goal: Hermes 4.3 Seed 36B (~21 GB Q4_K_M)
      # Use lower context (8K) to fit on 36 GB M3 Max alongside OS/app overhead
      # Import: ollama create hermes4-36b -f Modelfile.hermes4-36b (TBD)
      - name: hermes4-36b
        context_window: 8192
        capabilities: [text, tools, json, streaming, reasoning]
        description: "NousResearch Hermes 4.3 Seed 36B — stretch goal (Q4_K_M, ~21 GB)"

      # Creative writing fallback (Dolphin 3.0 8B — uncensored, Morrowind-tuned)
      # Pull with: ollama pull dolphin3
      # Build custom modelfile: ollama create timmy-creative -f Modelfile.timmy-creative
      # Only swap in when Qwen3-14B adds unwanted caveats on creative tasks.
      # Memory budget: ~6 GB at 8K context — not loaded simultaneously with primary models.
      - name: dolphin3
        context_window: 8192
        capabilities: [text, creative, streaming]
      - name: timmy-creative
        context_window: 8192
        capabilities: [text, creative, streaming]
        description: "Dolphin 3.0 8B with Morrowind system prompt and higher temperature"

  # Secondary: vllm-mlx (OpenAI-compatible local backend, 25–50% faster than Ollama on Apple Silicon)
  # Evaluation results (EuroMLSys '26 / M3 Ultra benchmarks):
  #   - 21–87% higher throughput than llama.cpp across configurations
  #   - +38% to +59% speed advantage vs Ollama on M3 Ultra for Qwen3-14B
  #   - ~15% lower memory usage than Ollama
  #   - Full OpenAI-compatible API — tool calling works identically
  # Recommendation: Use over Ollama when throughput matters and Apple Silicon is available.
  # Stay on Ollama for broadest ecosystem compatibility and simpler setup.
  # To enable: start vllm-mlx server (`python -m vllm.entrypoints.openai.api_server
  # --model Qwen/Qwen2.5-14B-Instruct-MLX --port 8000`) then set enabled: true.
  - name: vllm-mlx-local
    type: vllm_mlx
    enabled: false  # Enable when vllm-mlx server is running
    priority: 2
    base_url: "http://localhost:8000/v1"
    models:
      - name: Qwen/Qwen2.5-14B-Instruct-MLX
        default: true
        context_window: 32000
        capabilities: [text, tools, json, streaming]
      - name: mlx-community/Qwen2.5-7B-Instruct-4bit
        context_window: 32000
        capabilities: [text, tools, json, streaming]

  # Tertiary: OpenAI (if API key available)
  - name: openai-backup
    type: openai
    enabled: false  # Enable by setting OPENAI_API_KEY
    priority: 3
    api_key: "${OPENAI_API_KEY}"  # Loaded from environment
    base_url: null  # Use default OpenAI endpoint
    models:
      - name: gpt-4o-mini
        default: true
        context_window: 128000
        capabilities: [text, vision, tools, json, streaming]
      - name: gpt-4o
        context_window: 128000
        capabilities: [text, vision, tools, json, streaming]

  # Quaternary: Anthropic (if API key available)
  - name: anthropic-backup
    type: anthropic
    enabled: false  # Enable by setting ANTHROPIC_API_KEY
    priority: 4
    api_key: "${ANTHROPIC_API_KEY}"
    models:
      - name: claude-3-haiku-20240307
        default: true
        context_window: 200000
        capabilities: [text, vision, streaming]
      - name: claude-3-sonnet-20240229
        context_window: 200000
        capabilities: [text, vision, tools, streaming]

# ── Capability-Based Fallback Chains ────────────────────────────────────────
# When a model doesn't support a required capability (e.g., vision),
# the system falls back through these chains in order.
fallback_chains:
  # Vision-capable models (for image understanding)
  vision:
    - "llama3.2:3b"     # Fast, good vision
    - "qwen2.5-vl:3b"   # Excellent vision, small
    - "llava:7b"        # Classic vision model
    - "moondream:1.8b"  # Tiny, fast vision

  # Tool-calling models (for function calling)
  tools:
    - hermes4-14b             # Native tool calling + structured JSON (AutoLoRA base)
    - "llama3.1:8b-instruct"  # Reliable tool use
    # NOTE(review): qwen2.5:7b is not declared under providers (only qwen2.5:14b
    # is) — confirm it should be pulled on demand, or switch to qwen2.5:14b.
    - "qwen2.5:7b"            # Reliable tools
    - "llama3.2:3b"           # Small but capable

  # General text generation (any model)
  text:
    - "qwen3:30b"
    - "llama3.1:8b-instruct"
    - "qwen2.5:14b"
    - "deepseek-r1:1.5b"
    - "llama3.2:3b"

  # Creative writing fallback chain
  # Ordered preference: Morrowind-tuned Dolphin → base Dolphin 3 → Qwen3 (primary)
  # Invoke when Qwen3-14B adds unwanted caveats on journal/lore/NPC tasks.
  creative:
    - timmy-creative  # dolphin3 + Morrowind system prompt (Modelfile.timmy-creative)
    - dolphin3        # base Dolphin 3.0 8B (uncensored, no custom system prompt)
    - "qwen3:30b"     # primary fallback — usually sufficient with a good system prompt

# ── Custom Models ───────────────────────────────────────────────────────────
# Register custom model weights for per-agent assignment.
# Supports GGUF (Ollama), safetensors, and HuggingFace checkpoint dirs.
# Models can also be registered at runtime via the /api/v1/models API.
#
# Roles: general (default inference), reward (PRM scoring),
#        teacher (distillation), judge (output evaluation)
custom_models: []
# Example entries:
# - name: my-finetuned-llama
#   format: gguf
#   path: /path/to/model.gguf
#   role: general
#   context_window: 8192
#   description: "Fine-tuned Llama for code tasks"
#
# - name: reward-model
#   format: ollama
#   path: deepseek-r1:1.5b
#   role: reward
#   context_window: 32000
#   description: "Process reward model for scoring outputs"

# ── Agent Model Assignments ─────────────────────────────────────────────────
# Map persona agent IDs to specific models.
# Agents without an assignment use the global default (ollama_model).
agent_model_assignments: {}
# Example:
#   persona-forge: my-finetuned-llama
#   persona-echo: deepseek-r1:1.5b

# ── Multi-Modal Settings ────────────────────────────────────────────────────
multimodal:
  # Automatically pull models when needed
  auto_pull: true
  # Timeout for model pulling (seconds)
  pull_timeout: 300
  # Maximum fallback depth (how many models to try before giving up)
  max_fallback_depth: 3
  # Prefer smaller models for vision when available (faster)
  prefer_small_vision: true

# Cost tracking (optional, for budget monitoring)
cost_tracking:
  enabled: true
  budget_daily_usd: 10.0       # Alert if daily spend exceeds this
  alert_threshold_percent: 80  # Alert at 80% of budget

# Metrics retention
metrics:
  retention_hours: 168      # Keep 7 days of metrics
  purge_interval_hours: 24