2026-02-25 19:43:43 -05:00
|
|
|
|
# Cascade LLM Router Configuration
|
|
|
|
|
|
# Providers are tried in priority order (1 = highest)
|
|
|
|
|
|
# On failure, automatically falls back to next provider
|
|
|
|
|
|
|
|
|
|
|
|
cascade:
|
|
|
|
|
|
# Timeout settings
|
|
|
|
|
|
timeout_seconds: 30
|
|
|
|
|
|
|
|
|
|
|
|
# Retry settings
|
|
|
|
|
|
max_retries_per_provider: 2
|
|
|
|
|
|
retry_delay_seconds: 1
|
|
|
|
|
|
|
|
|
|
|
|
# Circuit breaker settings
|
|
|
|
|
|
circuit_breaker:
|
|
|
|
|
|
failure_threshold: 5 # Open circuit after 5 failures
|
|
|
|
|
|
recovery_timeout: 60 # Try again after 60 seconds
|
|
|
|
|
|
half_open_max_calls: 2 # Allow 2 test calls when half-open
|
|
|
|
|
|
|
|
|
|
|
|
providers:
|
|
|
|
|
|
# Primary: Local Ollama (always try first for sovereignty)
|
|
|
|
|
|
- name: ollama-local
|
|
|
|
|
|
type: ollama
|
|
|
|
|
|
enabled: true
|
|
|
|
|
|
priority: 1
|
2026-03-23 18:21:18 +00:00
|
|
|
|
tier: local
|
2026-02-25 19:43:43 -05:00
|
|
|
|
url: "http://localhost:11434"
|
|
|
|
|
|
models:
|
2026-03-23 22:58:21 +00:00
|
|
|
|
# ── Dual-model routing: Qwen3-8B (fast) + Qwen3-14B (quality) ──────────
|
|
|
|
|
|
# Both models fit simultaneously: ~6.6 GB + ~10.5 GB = ~17 GB combined.
|
|
|
|
|
|
# Requires OLLAMA_MAX_LOADED_MODELS=2 (set in .env) to stay hot.
|
|
|
|
|
|
# Ref: issue #1065 — Qwen3-8B/14B dual-model routing strategy
|
|
|
|
|
|
- name: qwen3:8b
|
|
|
|
|
|
context_window: 32768
|
|
|
|
|
|
capabilities: [text, tools, json, streaming, routine]
|
|
|
|
|
|
description: "Qwen3-8B Q6_K — fast router for routine tasks (~6.6 GB, 45-55 tok/s)"
|
|
|
|
|
|
- name: qwen3:14b
|
|
|
|
|
|
context_window: 40960
|
|
|
|
|
|
capabilities: [text, tools, json, streaming, complex, reasoning]
|
|
|
|
|
|
description: "Qwen3-14B Q5_K_M — complex reasoning and planning (~10.5 GB, 20-28 tok/s)"
|
|
|
|
|
|
|
2026-02-26 22:29:44 -05:00
|
|
|
|
# Text + Tools models
|
2026-03-15 12:34:21 -04:00
|
|
|
|
- name: qwen3:30b
|
2026-02-25 19:43:43 -05:00
|
|
|
|
default: true
|
|
|
|
|
|
context_window: 128000
|
2026-03-14 18:54:43 -04:00
|
|
|
|
# Note: actual context is capped by OLLAMA_NUM_CTX (default 4096) to save RAM
|
2026-02-26 22:29:44 -05:00
|
|
|
|
capabilities: [text, tools, json, streaming]
|
2026-03-11 16:55:27 -04:00
|
|
|
|
- name: llama3.1:8b-instruct
|
|
|
|
|
|
context_window: 128000
|
|
|
|
|
|
capabilities: [text, tools, json, streaming]
|
2026-02-26 22:29:44 -05:00
|
|
|
|
- name: llama3.2:3b
|
|
|
|
|
|
context_window: 128000
|
|
|
|
|
|
capabilities: [text, tools, json, streaming, vision] # NOTE(review): vision is only in Llama 3.2 11B/90B — verify the 3b tag actually accepts images
|
|
|
|
|
|
- name: qwen2.5:14b
|
|
|
|
|
|
context_window: 32000
|
|
|
|
|
|
capabilities: [text, tools, json, streaming]
|
2026-02-25 19:43:43 -05:00
|
|
|
|
- name: deepseek-r1:1.5b
|
|
|
|
|
|
context_window: 32000
|
2026-02-26 22:29:44 -05:00
|
|
|
|
capabilities: [text, json, streaming]
|
|
|
|
|
|
|
|
|
|
|
|
# Vision models
|
|
|
|
|
|
- name: llava:7b
|
|
|
|
|
|
context_window: 4096
|
|
|
|
|
|
capabilities: [text, vision, streaming]
|
|
|
|
|
|
- name: qwen2.5-vl:3b
|
|
|
|
|
|
context_window: 32000
|
|
|
|
|
|
capabilities: [text, vision, tools, json, streaming]
|
|
|
|
|
|
- name: moondream:1.8b
|
|
|
|
|
|
context_window: 2048
|
|
|
|
|
|
capabilities: [text, vision, streaming]
|
2026-03-23 15:25:06 +00:00
|
|
|
|
|
2026-03-23 17:59:45 +00:00
|
|
|
|
# AutoLoRA base: Hermes 4 14B — native tool calling, hybrid reasoning, structured JSON
|
|
|
|
|
|
# Import via: ollama create hermes4-14b -f Modelfile.hermes4-14b
|
|
|
|
|
|
# See Modelfile.hermes4-14b for GGUF download instructions (Project Bannerlord #1101)
|
|
|
|
|
|
- name: hermes4-14b
|
|
|
|
|
|
context_window: 32768
|
|
|
|
|
|
capabilities: [text, tools, json, streaming, reasoning]
|
|
|
|
|
|
description: "NousResearch Hermes 4 14B — AutoLoRA base (Q5_K_M, ~11 GB)"
|
|
|
|
|
|
|
2026-03-23 18:21:32 +00:00
|
|
|
|
# AutoLoRA fine-tuned: Timmy — Hermes 4 14B + Timmy LoRA adapter (Project Bannerlord #1104)
|
|
|
|
|
|
# Build via: ./scripts/fuse_and_load.sh (fuses adapter, converts to GGUF, imports)
|
|
|
|
|
|
# Then switch harness: hermes model timmy
|
|
|
|
|
|
# Validate: python scripts/test_timmy_skills.py
|
|
|
|
|
|
- name: timmy
|
|
|
|
|
|
context_window: 32768
|
|
|
|
|
|
capabilities: [text, tools, json, streaming, reasoning]
|
|
|
|
|
|
description: "Timmy — Hermes 4 14B fine-tuned on Timmy skill set (LoRA-fused, Q5_K_M, ~11 GB)"
|
|
|
|
|
|
|
2026-03-23 17:59:45 +00:00
|
|
|
|
# AutoLoRA stretch goal: Hermes 4.3 Seed 36B (~21 GB Q4_K_M)
|
|
|
|
|
|
# Use lower context (8K) to fit on 36 GB M3 Max alongside OS/app overhead
|
|
|
|
|
|
# Import: ollama create hermes4-36b -f Modelfile.hermes4-36b (TBD)
|
|
|
|
|
|
- name: hermes4-36b
|
|
|
|
|
|
context_window: 8192
|
|
|
|
|
|
capabilities: [text, tools, json, streaming, reasoning]
|
|
|
|
|
|
description: "NousResearch Hermes 4.3 Seed 36B — stretch goal (Q4_K_M, ~21 GB)"
|
|
|
|
|
|
|
2026-03-23 15:25:06 +00:00
|
|
|
|
# Creative writing fallback — base Dolphin 3.0 8B (uncensored) plus the Morrowind-tuned timmy-creative variant below
|
|
|
|
|
|
# Pull with: ollama pull dolphin3
|
|
|
|
|
|
# Build custom modelfile: ollama create timmy-creative -f Modelfile.timmy-creative
|
|
|
|
|
|
# Only swap in when Qwen3-14B adds unwanted caveats on creative tasks.
|
|
|
|
|
|
# Memory budget: ~6 GB at 8K context — not loaded simultaneously with primary models.
|
|
|
|
|
|
- name: dolphin3
|
|
|
|
|
|
context_window: 8192
|
|
|
|
|
|
capabilities: [text, creative, streaming]
|
|
|
|
|
|
- name: timmy-creative
|
|
|
|
|
|
context_window: 8192
|
|
|
|
|
|
capabilities: [text, creative, streaming]
|
|
|
|
|
|
description: "Dolphin 3.0 8B with Morrowind system prompt and higher temperature"
|
|
|
|
|
|
|
2026-03-23 15:34:13 +00:00
|
|
|
|
# Secondary: vllm-mlx (OpenAI-compatible local backend, 25–50% faster than Ollama on Apple Silicon)
|
|
|
|
|
|
# Evaluation results (EuroMLSys '26 / M3 Ultra benchmarks):
|
|
|
|
|
|
# - 21–87% higher throughput than llama.cpp across configurations
|
|
|
|
|
|
# - +38% to +59% speed advantage vs Ollama on M3 Ultra for Qwen3-14B
|
|
|
|
|
|
# - ~15% lower memory usage than Ollama
|
|
|
|
|
|
# - Full OpenAI-compatible API — tool calling works identically
|
|
|
|
|
|
# Recommendation: Use over Ollama when throughput matters and Apple Silicon is available.
|
|
|
|
|
|
# Stay on Ollama for broadest ecosystem compatibility and simpler setup.
|
|
|
|
|
|
# To enable: start vllm-mlx server (`python -m vllm.entrypoints.openai.api_server
|
|
|
|
|
|
# --model Qwen/Qwen2.5-14B-Instruct-MLX --port 8000`) then set enabled: true.
|
|
|
|
|
|
- name: vllm-mlx-local
|
|
|
|
|
|
type: vllm_mlx
|
|
|
|
|
|
enabled: false # Enable when vllm-mlx server is running
|
|
|
|
|
|
priority: 2
|
2026-03-23 18:21:18 +00:00
|
|
|
|
tier: local
|
2026-03-23 15:34:13 +00:00
|
|
|
|
base_url: "http://localhost:8000/v1"
|
|
|
|
|
|
models:
|
|
|
|
|
|
- name: Qwen/Qwen2.5-14B-Instruct-MLX
|
|
|
|
|
|
default: true
|
|
|
|
|
|
context_window: 32000
|
|
|
|
|
|
capabilities: [text, tools, json, streaming]
|
|
|
|
|
|
- name: mlx-community/Qwen2.5-7B-Instruct-4bit
|
|
|
|
|
|
context_window: 32000
|
|
|
|
|
|
capabilities: [text, tools, json, streaming]
|
2026-03-23 15:25:06 +00:00
|
|
|
|
|
2026-02-25 19:43:43 -05:00
|
|
|
|
# Tertiary: OpenAI (if API key available)
|
|
|
|
|
|
- name: openai-backup
|
|
|
|
|
|
type: openai
|
|
|
|
|
|
enabled: false # Enable by setting OPENAI_API_KEY
|
|
|
|
|
|
priority: 3
|
2026-03-23 18:21:18 +00:00
|
|
|
|
tier: standard_cloud
|
2026-02-25 19:43:43 -05:00
|
|
|
|
api_key: "${OPENAI_API_KEY}" # Loaded from environment
|
|
|
|
|
|
base_url: null # Use default OpenAI endpoint
|
|
|
|
|
|
models:
|
|
|
|
|
|
- name: gpt-4o-mini
|
|
|
|
|
|
default: true
|
|
|
|
|
|
context_window: 128000
|
2026-02-26 22:29:44 -05:00
|
|
|
|
capabilities: [text, vision, tools, json, streaming]
|
2026-02-25 19:43:43 -05:00
|
|
|
|
- name: gpt-4o
|
|
|
|
|
|
context_window: 128000
|
2026-02-26 22:29:44 -05:00
|
|
|
|
capabilities: [text, vision, tools, json, streaming]
|
2026-02-25 19:43:43 -05:00
|
|
|
|
|
|
|
|
|
|
# Quaternary: Anthropic (if API key available)
|
|
|
|
|
|
- name: anthropic-backup
|
|
|
|
|
|
type: anthropic
|
|
|
|
|
|
enabled: false # Enable by setting ANTHROPIC_API_KEY
|
|
|
|
|
|
priority: 4
|
2026-03-23 18:21:18 +00:00
|
|
|
|
tier: frontier
|
2026-02-25 19:43:43 -05:00
|
|
|
|
api_key: "${ANTHROPIC_API_KEY}"
|
|
|
|
|
|
models:
|
|
|
|
|
|
- name: claude-3-haiku-20240307
|
|
|
|
|
|
default: true
|
|
|
|
|
|
context_window: 200000
|
2026-02-26 22:29:44 -05:00
|
|
|
|
capabilities: [text, vision, streaming] # NOTE(review): Claude 3 Haiku also supports tool use — consider adding 'tools' (Sonnet entry below has it)
|
2026-02-25 19:43:43 -05:00
|
|
|
|
- name: claude-3-sonnet-20240229
|
|
|
|
|
|
context_window: 200000
|
2026-02-26 22:29:44 -05:00
|
|
|
|
capabilities: [text, vision, tools, streaming]
|
2026-02-25 19:43:43 -05:00
|
|
|
|
|
2026-02-26 22:29:44 -05:00
|
|
|
|
# ── Capability-Based Fallback Chains ────────────────────────────────────────
|
|
|
|
|
|
# When a model doesn't support a required capability (e.g., vision),
|
|
|
|
|
|
# the system falls back through these chains in order.
|
|
|
|
|
|
|
|
|
|
|
|
fallback_chains:
|
|
|
|
|
|
# Vision-capable models (for image understanding)
|
|
|
|
|
|
vision:
|
|
|
|
|
|
- llama3.2:3b # NOTE(review): Llama 3.2 3B is text-only — vision requires the 11B/90B variants; confirm before keeping this first in the vision chain
|
|
|
|
|
|
- qwen2.5-vl:3b # Excellent vision, small
|
|
|
|
|
|
- llava:7b # Classic vision model
|
|
|
|
|
|
- moondream:1.8b # Tiny, fast vision
|
|
|
|
|
|
|
|
|
|
|
|
# Tool-calling models (for function calling)
|
|
|
|
|
|
tools:
|
2026-03-23 18:21:32 +00:00
|
|
|
|
- timmy # Fine-tuned Timmy (Hermes 4 14B + LoRA) — primary agent model
|
2026-03-23 17:59:45 +00:00
|
|
|
|
- hermes4-14b # Native tool calling + structured JSON (AutoLoRA base)
|
|
|
|
|
|
- llama3.1:8b-instruct # Reliable tool use
|
2026-02-26 22:29:44 -05:00
|
|
|
|
- qwen2.5:14b # Reliable tools (was qwen2.5:7b — not registered in the Ollama models list above; 14b is)
|
|
|
|
|
|
- llama3.2:3b # Small but capable
|
|
|
|
|
|
|
|
|
|
|
|
# General text generation (any model)
|
|
|
|
|
|
text:
|
2026-03-15 12:34:21 -04:00
|
|
|
|
- qwen3:30b
|
2026-02-26 22:29:44 -05:00
|
|
|
|
- llama3.1:8b-instruct
|
|
|
|
|
|
- qwen2.5:14b
|
|
|
|
|
|
- deepseek-r1:1.5b
|
|
|
|
|
|
- llama3.2:3b
|
|
|
|
|
|
|
2026-03-23 15:25:06 +00:00
|
|
|
|
# Creative writing fallback chain
|
|
|
|
|
|
# Ordered preference: Morrowind-tuned Dolphin → base Dolphin 3 → Qwen3 (primary)
|
|
|
|
|
|
# Invoke when Qwen3-14B adds unwanted caveats on journal/lore/NPC tasks.
|
|
|
|
|
|
creative:
|
|
|
|
|
|
- timmy-creative # dolphin3 + Morrowind system prompt (Modelfile.timmy-creative)
|
|
|
|
|
|
- dolphin3 # base Dolphin 3.0 8B (uncensored, no custom system prompt)
|
|
|
|
|
|
- qwen3:30b # primary fallback — usually sufficient with a good system prompt
|
|
|
|
|
|
|
2026-03-23 22:58:21 +00:00
|
|
|
|
# ── Complexity-based routing chains (issue #1065) ───────────────────────
|
|
|
|
|
|
# Routine tasks: prefer Qwen3-8B for low latency (~45-55 tok/s)
|
|
|
|
|
|
routine:
|
|
|
|
|
|
- qwen3:8b # Primary fast model
|
|
|
|
|
|
- llama3.1:8b-instruct # Fallback fast model
|
|
|
|
|
|
- llama3.2:3b # Smallest available
|
|
|
|
|
|
|
|
|
|
|
|
# Complex tasks: prefer Qwen3-14B for quality (~20-28 tok/s)
|
|
|
|
|
|
complex:
|
|
|
|
|
|
- qwen3:14b # Primary quality model
|
|
|
|
|
|
- hermes4-14b # Native tool calling, hybrid reasoning
|
|
|
|
|
|
- qwen3:30b # Highest local quality
|
|
|
|
|
|
- qwen2.5:14b # Additional fallback
|
|
|
|
|
|
|
2026-02-26 22:29:44 -05:00
|
|
|
|
# ── Custom Models ───────────────────────────────────────────────────────────
|
feat: add custom weights, model registry, per-agent models, and reward scoring
Inspired by OpenClaw-RL's multi-model orchestration, this adds four
features for custom model management:
1. Custom model registry (infrastructure/models/registry.py) — SQLite-backed
registry for GGUF, safetensors, HF checkpoint, and Ollama models with
role-based lookups (general, reward, teacher, judge).
2. Per-agent model assignment — each swarm persona can use a different model
instead of sharing the global default. Resolved via registry assignment >
persona default > global default.
3. Runtime model management API (/api/v1/models) — REST endpoints to register,
list, assign, enable/disable, and remove custom models without restart.
Includes a dashboard page at /models.
4. Reward model scoring (PRM-style) — majority-vote quality evaluation of
agent outputs using a configurable reward model. Scores persist in SQLite
and feed into the swarm learner.
New config settings: custom_weights_dir, reward_model_enabled,
reward_model_name, reward_model_votes.
54 new tests covering registry CRUD, API endpoints, agent assignments,
role lookups, and reward scoring.
https://claude.ai/code/session_01V4iTozMwcE2gjfnCJdCugC
2026-02-27 01:08:03 +00:00
|
|
|
|
# Register custom model weights for per-agent assignment.
|
|
|
|
|
|
# Supports GGUF (Ollama), safetensors, and HuggingFace checkpoint dirs.
|
|
|
|
|
|
# Models can also be registered at runtime via the /api/v1/models API.
|
|
|
|
|
|
#
|
|
|
|
|
|
# Roles: general (default inference), reward (PRM scoring),
|
|
|
|
|
|
# teacher (distillation), judge (output evaluation)
|
|
|
|
|
|
custom_models: []
|
|
|
|
|
|
# Example entries:
|
|
|
|
|
|
# - name: my-finetuned-llama
|
|
|
|
|
|
# format: gguf
|
|
|
|
|
|
# path: /path/to/model.gguf
|
|
|
|
|
|
# role: general
|
|
|
|
|
|
# context_window: 8192
|
|
|
|
|
|
# description: "Fine-tuned Llama for code tasks"
|
|
|
|
|
|
#
|
|
|
|
|
|
# - name: reward-model
|
|
|
|
|
|
# format: ollama
|
|
|
|
|
|
# path: deepseek-r1:1.5b
|
|
|
|
|
|
# role: reward
|
|
|
|
|
|
# context_window: 32000
|
|
|
|
|
|
# description: "Process reward model for scoring outputs"
|
|
|
|
|
|
|
2026-02-26 22:29:44 -05:00
|
|
|
|
# ── Agent Model Assignments ─────────────────────────────────────────────────
|
feat: add custom weights, model registry, per-agent models, and reward scoring
Inspired by OpenClaw-RL's multi-model orchestration, this adds four
features for custom model management:
1. Custom model registry (infrastructure/models/registry.py) — SQLite-backed
registry for GGUF, safetensors, HF checkpoint, and Ollama models with
role-based lookups (general, reward, teacher, judge).
2. Per-agent model assignment — each swarm persona can use a different model
instead of sharing the global default. Resolved via registry assignment >
persona default > global default.
3. Runtime model management API (/api/v1/models) — REST endpoints to register,
list, assign, enable/disable, and remove custom models without restart.
Includes a dashboard page at /models.
4. Reward model scoring (PRM-style) — majority-vote quality evaluation of
agent outputs using a configurable reward model. Scores persist in SQLite
and feed into the swarm learner.
New config settings: custom_weights_dir, reward_model_enabled,
reward_model_name, reward_model_votes.
54 new tests covering registry CRUD, API endpoints, agent assignments,
role lookups, and reward scoring.
https://claude.ai/code/session_01V4iTozMwcE2gjfnCJdCugC
2026-02-27 01:08:03 +00:00
|
|
|
|
# Map persona agent IDs to specific models.
|
|
|
|
|
|
# Agents without an assignment use the global default (ollama_model).
|
|
|
|
|
|
agent_model_assignments: {}
|
|
|
|
|
|
# Example:
|
|
|
|
|
|
# persona-forge: my-finetuned-llama
|
|
|
|
|
|
# persona-echo: deepseek-r1:1.5b
|
|
|
|
|
|
|
2026-02-26 22:29:44 -05:00
|
|
|
|
# ── Multi-Modal Settings ────────────────────────────────────────────────────
|
|
|
|
|
|
multimodal:
|
|
|
|
|
|
# Automatically pull models when needed
|
|
|
|
|
|
auto_pull: true
|
|
|
|
|
|
|
|
|
|
|
|
# Timeout for model pulling (seconds)
|
|
|
|
|
|
pull_timeout: 300
|
|
|
|
|
|
|
|
|
|
|
|
# Maximum fallback depth (how many models to try before giving up)
|
|
|
|
|
|
max_fallback_depth: 3
|
|
|
|
|
|
|
|
|
|
|
|
# Prefer smaller models for vision when available (faster)
|
|
|
|
|
|
prefer_small_vision: true
|
|
|
|
|
|
|
2026-02-25 19:43:43 -05:00
|
|
|
|
# Cost tracking (optional, for budget monitoring)
|
|
|
|
|
|
cost_tracking:
|
|
|
|
|
|
enabled: true
|
|
|
|
|
|
budget_daily_usd: 10.0 # Alert if daily spend exceeds this
|
|
|
|
|
|
alert_threshold_percent: 80 # Alert at 80% of budget
|
|
|
|
|
|
|
|
|
|
|
|
# Metrics retention
|
|
|
|
|
|
metrics:
|
|
|
|
|
|
retention_hours: 168 # Keep 7 days of metrics
|
|
|
|
|
|
purge_interval_hours: 24
|