Files
the-nexus/lazarus-registry.yaml
Timmy Time ac3ab8075d
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
feat(lazarus): Add v1.0.0 fleet health and resurrection registry
- Defines fleet inventory, fallback chains, and provider health matrix
- Documents known issues per wizard (Allegro host unknown, Ezra timeouts)
- Includes resurrection protocol and watchdog configuration
- Foundation for automated Lazarus Pit operations (#911)

Co-authored-by: Bezalel <bezalel@timmy.foundation>
2026-04-07 02:58:22 +00:00

152 lines
3.7 KiB
YAML

# Lazarus Pit Registry — Single Source of Truth for Fleet Health and Resurrection
# Version: 1.0.0
# Owner: Bezalel (deployment), Ezra (compilation), Allegro (validation)
meta:
version: "1.0.0"
updated_at: "2026-04-07T02:55:00Z"
next_review: "2026-04-14T02:55:00Z"
fleet:
bezalel:
role: forge-and-testbed wizard
host: 104.131.15.18
vps_provider: digitalocean
primary:
provider: kimi-coding
model: kimi-k2.5
fallback_chain:
- provider: kimi-coding
model: kimi-k2.5
timeout: 120
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
- provider: big_brain
model: gemma3:27b-instruct-q8_0
timeout: 300
health_endpoints:
gateway: "http://127.0.0.1:8646"
api_server: "http://127.0.0.1:8656"
auto_restart: true
allegro:
role: code-craft wizard
host: UNKNOWN
vps_provider: UNKNOWN
primary:
provider: kimi-coding
model: kimi-k2.5
fallback_chain:
- provider: kimi-coding
model: kimi-k2.5
timeout: 120
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
health_endpoints:
gateway: "http://127.0.0.1:8645"
auto_restart: true
known_issues:
- host_and_vps_unknown_to_fleet
- config_needs_runtime_refresh
ezra:
role: archivist-and-interpreter wizard
host: UNKNOWN
vps_provider: UNKNOWN
primary:
provider: anthropic
model: claude-sonnet-4-20250514
fallback_chain:
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
auto_restart: true
known_issues:
- timeout_choking_on_long_operations
timmy:
role: sovereign core
host: UNKNOWN
vps_provider: UNKNOWN
primary:
provider: anthropic
model: claude-sonnet-4-20250514
fallback_chain:
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
auto_restart: true
provider_health_matrix:
kimi-coding:
status: degraded
note: "kimi-for-coding returns 403 access-terminated; use kimi-k2.5 model only"
last_checked: "2026-04-07T02:55:00Z"
rate_limited: false
dead: false
anthropic:
status: healthy
last_checked: "2026-04-07T02:55:00Z"
rate_limited: false
dead: false
openrouter:
status: healthy
last_checked: "2026-04-07T02:55:00Z"
rate_limited: false
dead: false
big_brain:
status: provisioning
note: "RunPod L40S instance big-brain-bezalel deployed; Ollama endpoint propagating"
last_checked: "2026-04-07T02:55:00Z"
endpoint: "http://yxw29g3excyddq-64411cd0-11434.tcp.runpod.net:11434/v1"
rate_limited: false
dead: false
timeout_policies:
gateway:
inactivity_timeout_seconds: 600
diagnostic_on_timeout: true
cron:
inactivity_timeout_seconds: 0 # unlimited while active
agent:
default_turn_timeout: 120
long_operation_heartbeat: true
watchdog:
enabled: true
interval_seconds: 60
actions:
- ping_agent_gateways
- probe_providers
- parse_agent_logs
- update_registry
- auto_promote_fallbacks
- auto_restart_dead_agents
resurrection_protocol:
soft:
- reload_config_from_registry
- rewrite_fallback_providers
- promote_first_healthy_fallback
hard:
- systemctl_restart_gateway
- log_incident
- notify_sovereign