# Repository page residue (not part of the registry data), kept as comments:
# CI status: some checks failed — "Deploy Nexus / deploy (push)" has been cancelled.
# Commit message:
#   - Defines fleet inventory, fallback chains, and provider health matrix
#   - Documents known issues per wizard (Allegro host unknown, Ezra timeouts)
#   - Includes resurrection protocol and watchdog configuration
#   - Foundation for automated Lazarus Pit operations (#911)
#   Co-authored-by: Bezalel <bezalel@timmy.foundation>
# File info: 152 lines, 3.7 KiB, YAML
# Lazarus Pit Registry — Single Source of Truth for Fleet Health and Resurrection
# Version: 1.0.0
# Owner: Bezalel (deployment), Ezra (compilation), Allegro (validation)

# Registry metadata: the registry version and its review timestamps.
meta:
  # Quoted so the value stays a string.
  version: "1.0.0"
  # Timestamps are quoted: an unquoted ISO-8601 value would be converted to a
  # native timestamp by YAML 1.1 loaders instead of staying a string.
  updated_at: "2026-04-07T02:55:00Z"
  # Seven days after updated_at.
  next_review: "2026-04-14T02:55:00Z"

# Fleet inventory: one entry per wizard, keyed by wizard name.
# Every entry carries role, host, vps_provider, a primary provider/model pair,
# an ordered fallback_chain, and auto_restart. Some entries additionally list
# health_endpoints and known_issues.
# NOTE(review): fallback_chain `timeout` values are presumably seconds —
# confirm with the consumer of this registry.
fleet:
  bezalel:
    role: forge-and-testbed wizard
    # Quoted defensively so the dotted address is always read as a string.
    host: "104.131.15.18"
    vps_provider: digitalocean
    primary:
      provider: kimi-coding
      model: kimi-k2.5
    fallback_chain:
      # The first entry repeats the primary provider/model.
      - provider: kimi-coding
        model: kimi-k2.5
        timeout: 120
      - provider: anthropic
        model: claude-sonnet-4-20250514
        timeout: 120
      - provider: openrouter
        model: anthropic/claude-sonnet-4-20250514
        timeout: 120
      # Only bezalel lists big_brain; it has the longest timeout in the chain.
      - provider: big_brain
        # Quoted because the model tag contains a colon.
        model: "gemma3:27b-instruct-q8_0"
        timeout: 300
    # Loopback URLs — NOTE(review): presumably probed from the wizard's own
    # host; confirm where the watchdog runs.
    health_endpoints:
      gateway: "http://127.0.0.1:8646"
      api_server: "http://127.0.0.1:8656"
    auto_restart: true

  allegro:
    role: code-craft wizard
    # UNKNOWN is a literal placeholder string; see known_issues below.
    host: UNKNOWN
    vps_provider: UNKNOWN
    primary:
      provider: kimi-coding
      model: kimi-k2.5
    fallback_chain:
      - provider: kimi-coding
        model: kimi-k2.5
        timeout: 120
      - provider: anthropic
        model: claude-sonnet-4-20250514
        timeout: 120
      - provider: openrouter
        model: anthropic/claude-sonnet-4-20250514
        timeout: 120
    health_endpoints:
      gateway: "http://127.0.0.1:8645"
    auto_restart: true
    known_issues:
      - host_and_vps_unknown_to_fleet
      - config_needs_runtime_refresh

  ezra:
    role: archivist-and-interpreter wizard
    # UNKNOWN is a literal placeholder string, not a resolvable host.
    host: UNKNOWN
    vps_provider: UNKNOWN
    primary:
      provider: anthropic
      model: claude-sonnet-4-20250514
    fallback_chain:
      - provider: anthropic
        model: claude-sonnet-4-20250514
        timeout: 120
      - provider: openrouter
        model: anthropic/claude-sonnet-4-20250514
        timeout: 120
    # No health_endpoints are recorded for this wizard.
    auto_restart: true
    known_issues:
      - timeout_choking_on_long_operations

  timmy:
    role: sovereign core
    # UNKNOWN is a literal placeholder string, not a resolvable host.
    host: UNKNOWN
    vps_provider: UNKNOWN
    primary:
      provider: anthropic
      model: claude-sonnet-4-20250514
    fallback_chain:
      - provider: anthropic
        model: claude-sonnet-4-20250514
        timeout: 120
      - provider: openrouter
        model: anthropic/claude-sonnet-4-20250514
        timeout: 120
    # No health_endpoints are recorded for this wizard.
    auto_restart: true

# Per-provider health snapshot, keyed by the provider names used in
# fleet.*.primary and fleet.*.fallback_chain.
# Every entry records status, last_checked, rate_limited and dead; `note` and
# `endpoint` appear only where present below.
provider_health_matrix:
  kimi-coding:
    status: degraded
    note: "kimi-for-coding returns 403 access-terminated; use kimi-k2.5 model only"
    last_checked: "2026-04-07T02:55:00Z"
    rate_limited: false
    dead: false

  anthropic:
    status: healthy
    last_checked: "2026-04-07T02:55:00Z"
    rate_limited: false
    dead: false

  openrouter:
    status: healthy
    last_checked: "2026-04-07T02:55:00Z"
    rate_limited: false
    dead: false

  big_brain:
    status: provisioning
    note: "RunPod L40S instance big-brain-bezalel deployed; Ollama endpoint propagating"
    last_checked: "2026-04-07T02:55:00Z"
    # NOTE(review): plain-HTTP URL on a public hostname — confirm that
    # unencrypted transport is acceptable for this endpoint.
    endpoint: "http://yxw29g3excyddq-64411cd0-11434.tcp.runpod.net:11434/v1"
    rate_limited: false
    dead: false

# Timeout settings grouped by component (gateway, cron, agent).
timeout_policies:
  gateway:
    inactivity_timeout_seconds: 600
    diagnostic_on_timeout: true
  cron:
    inactivity_timeout_seconds: 0  # unlimited while active
  agent:
    # NOTE(review): unit is not stated here; presumably seconds like the
    # *_seconds keys above — confirm with the consumer.
    default_turn_timeout: 120
    long_operation_heartbeat: true

# Watchdog configuration: on/off switch, polling interval, and action list.
watchdog:
  enabled: true
  interval_seconds: 60
  # NOTE(review): presumably executed in the listed order on every interval —
  # confirm against the watchdog implementation.
  actions:
    - ping_agent_gateways
    - probe_providers
    - parse_agent_logs
    - update_registry
    - auto_promote_fallbacks
    - auto_restart_dead_agents

# Resurrection protocol: two named step lists, `soft` and `hard`.
# NOTE(review): presumably `soft` is attempted before escalating to `hard`,
# with steps run in the listed order — confirm against the implementation.
resurrection_protocol:
  soft:
    - reload_config_from_registry
    - rewrite_fallback_providers
    - promote_first_healthy_fallback
  hard:
    - systemctl_restart_gateway
    - log_incident
    - notify_sovereign