# config/providers.yaml

# Cascade LLM Router Configuration
# Providers are tried in priority order (1 = highest)
# On failure, the router automatically falls back to the next provider.

cascade:
  # Timeout, in seconds.
  timeout_seconds: 30

  # Retries: the count applies per provider; the delay is in seconds.
  max_retries_per_provider: 2
  retry_delay_seconds: 1

  # Circuit breaker thresholds.
  circuit_breaker:
    failure_threshold: 5    # failures before the circuit opens
    recovery_timeout: 60    # seconds to wait before trying again
    half_open_max_calls: 2  # test calls allowed while half-open

providers:
  # Priority 1: local Ollama on localhost (tried first, for sovereignty).
  - name: ollama-local
    type: ollama
    enabled: true
    priority: 1
    url: 'http://localhost:11434'
    models:
      # llama3.2 is this provider's default model.
      - name: 'llama3.2'
        default: true
        context_window: 128000
      - name: 'deepseek-r1:1.5b'
        context_window: 32000
    
  # Priority 2: local AirLLM. Off by default; turn on once the airllm
  # package is installed (pip install airllm).
  - name: airllm-local
    type: airllm
    enabled: false
    priority: 2
    models:
      # Model names are quoted so they are never mistaken for numbers.
      - name: '70b'
        default: true
      - name: '8b'
      - name: '405b'
    
  # Priority 3: OpenAI backup. Off by default; requires OPENAI_API_KEY.
  # NOTE(review): `enabled` is a literal false here; confirm whether the
  # loader auto-enables this provider when the key is set or whether the
  # flag must also be flipped by hand.
  - name: openai-backup
    type: openai
    enabled: false
    priority: 3
    # Placeholder; the value is loaded from the environment.
    api_key: '${OPENAI_API_KEY}'
    # null selects the default OpenAI endpoint.
    base_url: null
    models:
      - name: 'gpt-4o-mini'
        default: true
        context_window: 128000
      - name: 'gpt-4o'
        context_window: 128000
    
  # Priority 4: Anthropic backup. Off by default; requires ANTHROPIC_API_KEY.
  # NOTE(review): `enabled` is a literal false here; confirm whether the
  # loader auto-enables this provider when the key is set or whether the
  # flag must also be flipped by hand.
  - name: anthropic-backup
    type: anthropic
    enabled: false
    priority: 4
    # Placeholder; presumably substituted from the environment like the
    # OpenAI key (verify against the loader).
    api_key: '${ANTHROPIC_API_KEY}'
    models:
      - name: 'claude-3-haiku-20240307'
        default: true
        context_window: 200000
      - name: 'claude-3-sonnet-20240229'
        context_window: 200000

# ── Custom Models ──────────────────────────────────────────────────────
# Register custom model weights for per-agent assignment.
# Supports GGUF, safetensors, and HuggingFace checkpoint dirs, plus models
# referenced by Ollama tag (see the `format: ollama` example below).
# Models can also be registered at runtime via the /api/v1/models API.
#
# Roles: general (default inference), reward (PRM scoring),
#        teacher (distillation), judge (output evaluation)
#
# To add entries, REPLACE the empty list `[]` with a block sequence.
# Uncommenting the examples while `[]` is still present is invalid YAML.
custom_models: []
  # Example entries (shown as they would appear under `custom_models:`):
  # - name: my-finetuned-llama
  #   format: gguf
  #   path: /path/to/model.gguf
  #   role: general
  #   context_window: 8192
  #   description: "Fine-tuned Llama for code tasks"
  #
  # For `format: ollama`, `path` holds an Ollama model tag, not a file path:
  # - name: reward-model
  #   format: ollama
  #   path: deepseek-r1:1.5b
  #   role: reward
  #   context_window: 32000
  #   description: "Process reward model for scoring outputs"

# ── Agent Model Assignments ─────────────────────────────────────────────
# Map persona agent IDs to specific models.
# Agents without an assignment use the global default (ollama_model).
#
# To add assignments, REPLACE the empty mapping `{}` with indented
# `agent-id: model-name` pairs. Uncommenting the examples while `{}` is
# still present is invalid YAML.
agent_model_assignments: {}
  # Example (shown as it would appear under `agent_model_assignments:`):
  # persona-forge: my-finetuned-llama
  # persona-echo: deepseek-r1:1.5b

# Cost tracking (optional, for budget monitoring)
cost_tracking:
  enabled: true
  # Daily spend budget, in USD.
  budget_daily_usd: 10.0
  # Alert at 80% of the daily budget.
  # NOTE(review): the previous comments said both "alert if daily spend
  # exceeds the budget" and "alert at 80% of budget"; confirm whether alerts
  # fire at the threshold, at the full budget, or both.
  alert_threshold_percent: 80

# Metrics retention
metrics:
  retention_hours: 168  # Keep 7 days of metrics (7 x 24 h)
  # Presumably how often expired metrics are purged, in hours - TODO confirm.
  purge_interval_hours: 24
Phase 3: Cascade LLM Router with automatic failover

- YAML-based provider configuration (config/providers.yaml)
- Priority-ordered provider routing
- Circuit breaker pattern for failing providers
- Health check and availability monitoring
- Metrics tracking (latency, errors, success rates)
- Support for Ollama, OpenAI, Anthropic, AirLLM providers
- Automatic failover on rate limits or errors
- REST API endpoints for monitoring and control
- 41 comprehensive tests

API Endpoints:
- POST /api/v1/router/complete - Chat completion with failover
- GET /api/v1/router/status - Provider health status
- GET /api/v1/router/metrics - Detailed metrics
- GET /api/v1/router/providers - List all providers
- POST /api/v1/router/providers/{name}/control - Enable/disable/reset
- POST /api/v1/router/health-check - Run health checks
- GET /api/v1/router/config - View configuration

2026-02-25 19:43:43 -05:00
			`# Cascade LLM Router Configuration`
			`# Providers are tried in priority order (1 = highest)`
			`# On failure, automatically falls back to next provider`

			`cascade:`
			`# Timeout settings`
			`timeout_seconds: 30`

			`# Retry settings`
			`max_retries_per_provider: 2`
			`retry_delay_seconds: 1`

			`# Circuit breaker settings`
			`circuit_breaker:`
			`failure_threshold: 5 # Open circuit after 5 failures`
			`recovery_timeout: 60 # Try again after 60 seconds`
			`half_open_max_calls: 2 # Allow 2 test calls when half-open`

			`providers:`
			`# Primary: Local Ollama (always try first for sovereignty)`
			`- name: ollama-local`
			`type: ollama`
			`enabled: true`
			`priority: 1`
			`url: "http://localhost:11434"`
			`models:`
			`- name: llama3.2`
			`default: true`
			`context_window: 128000`
			`- name: deepseek-r1:1.5b`
			`context_window: 32000`

			`# Secondary: Local AirLLM (if installed)`
			`- name: airllm-local`
			`type: airllm`
			`enabled: false # Enable if pip install airllm`
			`priority: 2`
			`models:`
			`- name: 70b`
			`default: true`
			`- name: 8b`
			`- name: 405b`

			`# Tertiary: OpenAI (if API key available)`
			`- name: openai-backup`
			`type: openai`
			`enabled: false # Enable by setting OPENAI_API_KEY`
			`priority: 3`
			`api_key: "${OPENAI_API_KEY}" # Loaded from environment`
			`base_url: null # Use default OpenAI endpoint`
			`models:`
			`- name: gpt-4o-mini`
			`default: true`
			`context_window: 128000`
			`- name: gpt-4o`
			`context_window: 128000`

			`# Quaternary: Anthropic (if API key available)`
			`- name: anthropic-backup`
			`type: anthropic`
			`enabled: false # Enable by setting ANTHROPIC_API_KEY`
			`priority: 4`
			`api_key: "${ANTHROPIC_API_KEY}"`
			`models:`
			`- name: claude-3-haiku-20240307`
			`default: true`
			`context_window: 200000`
			`- name: claude-3-sonnet-20240229`
			`context_window: 200000`

feat: add custom weights, model registry, per-agent models, and reward scoring

Inspired by OpenClaw-RL's multi-model orchestration, this adds four features for custom model management:

1. Custom model registry (infrastructure/models/registry.py) — SQLite-backed registry for GGUF, safetensors, HF checkpoint, and Ollama models with role-based lookups (general, reward, teacher, judge).
2. Per-agent model assignment — each swarm persona can use a different model instead of sharing the global default. Resolved via registry assignment > persona default > global default.
3. Runtime model management API (/api/v1/models) — REST endpoints to register, list, assign, enable/disable, and remove custom models without restart. Includes a dashboard page at /models.
4. Reward model scoring (PRM-style) — majority-vote quality evaluation of agent outputs using a configurable reward model. Scores persist in SQLite and feed into the swarm learner.

New config settings: custom_weights_dir, reward_model_enabled, reward_model_name, reward_model_votes.

54 new tests covering registry CRUD, API endpoints, agent assignments, role lookups, and reward scoring.

https://claude.ai/code/session_01V4iTozMwcE2gjfnCJdCugC

2026-02-27 01:08:03 +00:00
			`# ── Custom Models ──────────────────────────────────────────────────────`
			`# Register custom model weights for per-agent assignment.`
			`# Supports GGUF (Ollama), safetensors, and HuggingFace checkpoint dirs.`
			`# Models can also be registered at runtime via the /api/v1/models API.`
			`#`
			`# Roles: general (default inference), reward (PRM scoring),`
			`# teacher (distillation), judge (output evaluation)`
			`custom_models: []`
			`# Example entries:`
			`# - name: my-finetuned-llama`
			`# format: gguf`
			`# path: /path/to/model.gguf`
			`# role: general`
			`# context_window: 8192`
			`# description: "Fine-tuned Llama for code tasks"`
			`#`
			`# - name: reward-model`
			`# format: ollama`
			`# path: deepseek-r1:1.5b`
			`# role: reward`
			`# context_window: 32000`
			`# description: "Process reward model for scoring outputs"`

			`# ── Agent Model Assignments ─────────────────────────────────────────────`
			`# Map persona agent IDs to specific models.`
			`# Agents without an assignment use the global default (ollama_model).`
			`agent_model_assignments: {}`
			`# Example:`
			`# persona-forge: my-finetuned-llama`
			`# persona-echo: deepseek-r1:1.5b`

Phase 3: Cascade LLM Router with automatic failover

- YAML-based provider configuration (config/providers.yaml)
- Priority-ordered provider routing
- Circuit breaker pattern for failing providers
- Health check and availability monitoring
- Metrics tracking (latency, errors, success rates)
- Support for Ollama, OpenAI, Anthropic, AirLLM providers
- Automatic failover on rate limits or errors
- REST API endpoints for monitoring and control
- 41 comprehensive tests

API Endpoints:
- POST /api/v1/router/complete - Chat completion with failover
- GET /api/v1/router/status - Provider health status
- GET /api/v1/router/metrics - Detailed metrics
- GET /api/v1/router/providers - List all providers
- POST /api/v1/router/providers/{name}/control - Enable/disable/reset
- POST /api/v1/router/health-check - Run health checks
- GET /api/v1/router/config - View configuration

2026-02-25 19:43:43 -05:00
			`# Cost tracking (optional, for budget monitoring)`
			`cost_tracking:`
			`enabled: true`
			`budget_daily_usd: 10.0 # Alert if daily spend exceeds this`
			`alert_threshold_percent: 80 # Alert at 80% of budget`

			`# Metrics retention`
			`metrics:`
			`retention_hours: 168 # Keep 7 days of metrics`
			`purge_interval_hours: 24`