Compare commits
6 Commits
claude/iss...groq/issue

| Author | SHA1 | Date |
|---|---|---|
| | 34862cf5e5 | |
| | 5275c96e52 | |
| | 36e1db9ae1 | |
| | 259df5b5e6 | |
| | 30fe98d569 | |
| | b0654bac6c | |
```diff
@@ -41,9 +41,11 @@ jobs:
       run: |
         FAIL=0
         for f in $(find . -name '*.py' -not -path './venv/*'); do
-          if ! python3 -c "import py_compile; py_compile.compile('$f', doraise=True)" 2>/dev/null; then
+          if python3 -c "import py_compile; py_compile.compile('$f', doraise=True)" 2>/dev/null; then
+            echo "OK: $f"
+          else
             echo "FAIL: $f"
             FAIL=1
           fi
         done
         exit $FAIL
```
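For reference, the same syntax gate can be run outside CI. A minimal standalone sketch in plain Python (not part of this change; file discovery mirrors the `find` invocation above):

```python
# Sketch: compile every .py file outside venv/, mirroring the workflow step.
import pathlib
import py_compile

failed = False
for path in pathlib.Path(".").rglob("*.py"):
    if "venv" in path.parts:
        continue  # equivalent of -not -path './venv/*'
    try:
        py_compile.compile(str(path), doraise=True)
        print(f"OK: {path}")
    except py_compile.PyCompileError as err:
        print(f"FAIL: {path}\n{err.msg}")
        failed = True
raise SystemExit(1 if failed else 0)
```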
```diff
@@ -9,7 +9,7 @@
     "id": 27,
     "name": "carnice",
     "gitea_user": "carnice",
-    "model": "qwen3.5-9b",
+    "model": "ollama:gemma4:12b",
     "tier": "free",
     "location": "Local Metal",
     "description": "Local Hermes agent, fine-tuned on Hermes traces. Runs on local hardware.",
```
```diff
@@ -41,7 +41,7 @@
     "id": 25,
     "name": "bilbobagginshire",
     "gitea_user": "bilbobagginshire",
-    "model": "ollama",
+    "model": "ollama:gemma4:12b",
     "tier": "free",
     "location": "Bag End, The Shire (VPS)",
     "description": "Ollama on VPS. Speaks when spoken to. Prefers quiet. Not for delegated work.",
```
```diff
@@ -74,7 +74,7 @@
     "id": 23,
     "name": "substratum",
     "gitea_user": "substratum",
-    "model": "unassigned",
+    "model": "ollama:gemma4:12b",
     "tier": "unknown",
     "location": "Below the Surface",
     "description": "Infrastructure, deployments, bedrock services. Needs model assignment before activation.",
```
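All three hunks converge the agent registry on a concrete `ollama:gemma4:12b` assignment. A quick audit that nothing is left on a placeholder could look like this (sketch; the `agents.json` file name and list-of-objects layout are assumptions, not confirmed by this diff):

```python
# Sketch: flag agents whose "model" field is still a placeholder.
import json

PLACEHOLDERS = {"unassigned", "ollama"}  # a bare provider name counts as unset

with open("agents.json") as f:  # hypothetical path for the registry shown above
    agents = json.load(f)

for agent in agents:
    if agent.get("model", "unassigned") in PLACEHOLDERS:
        print(f"{agent['name']}: needs a concrete model assignment")
```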
```diff
@@ -76,7 +76,7 @@ deepdive:
 # Phase 3: Synthesis
 synthesis:
   llm_endpoint: "http://localhost:4000/v1"  # Local llama-server
-  llm_model: "gemma-4-it"
+  llm_model: "gemma4:12b"
   max_summary_length: 800
   temperature: 0.7
```
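The synthesis block targets an OpenAI-compatible `/v1` endpoint, so the new `llm_model` value can be smoke-tested directly. A minimal sketch, assuming the server implements the standard `/chat/completions` route (not verified by this diff):

```python
# Sketch: one-shot request against the synthesis endpoint configured above.
import json
import urllib.request

payload = {
    "model": "gemma4:12b",
    "messages": [{"role": "user", "content": "Reply with OK."}],
    "temperature": 0.7,
}
req = urllib.request.Request(
    "http://localhost:4000/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=30) as resp:
    body = json.load(resp)
print(body["choices"][0]["message"]["content"])
```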
```diff
@@ -1,12 +1,7 @@
-# Lazarus Pit Registry — Single Source of Truth for Fleet Health and Resurrection
-# Version: 1.0.0
-# Owner: Bezalel (deployment), Ezra (compilation), Allegro (validation)
-
 meta:
-  version: "1.0.0"
-  updated_at: "2026-04-07T02:55:00Z"
-  next_review: "2026-04-14T02:55:00Z"
-
+  version: 1.0.0
+  updated_at: '2026-04-07T15:09:53.386648+00:00'
+  next_review: '2026-04-14T02:55:00Z'
 fleet:
   bezalel:
     role: forge-and-testbed wizard
@@ -16,23 +11,22 @@ fleet:
       provider: kimi-coding
       model: kimi-k2.5
     fallback_chain:
-      - provider: kimi-coding
-        model: kimi-k2.5
-        timeout: 120
-      - provider: anthropic
-        model: claude-sonnet-4-20250514
-        timeout: 120
-      - provider: openrouter
-        model: anthropic/claude-sonnet-4-20250514
-        timeout: 120
-      - provider: big_brain
-        model: gemma3:27b-instruct-q8_0
-        timeout: 300
+      - provider: kimi-coding
+        model: kimi-k2.5
+        timeout: 120
+      - provider: anthropic
+        model: claude-sonnet-4-20250514
+        timeout: 120
+      - provider: openrouter
+        model: anthropic/claude-sonnet-4-20250514
+        timeout: 120
+      - provider: ollama
+        model: gemma4:12b
+        timeout: 300
     health_endpoints:
-      gateway: "http://127.0.0.1:8646"
-      api_server: "http://127.0.0.1:8656"
+      gateway: http://127.0.0.1:8646
+      api_server: http://127.0.0.1:8656
     auto_restart: true
-
   allegro:
     role: code-craft wizard
     host: UNKNOWN
@@ -41,22 +35,21 @@ fleet:
       provider: kimi-coding
      model: kimi-k2.5
     fallback_chain:
-      - provider: kimi-coding
-        model: kimi-k2.5
-        timeout: 120
-      - provider: anthropic
-        model: claude-sonnet-4-20250514
-        timeout: 120
-      - provider: openrouter
-        model: anthropic/claude-sonnet-4-20250514
-        timeout: 120
+      - provider: kimi-coding
+        model: kimi-k2.5
+        timeout: 120
+      - provider: anthropic
+        model: claude-sonnet-4-20250514
+        timeout: 120
+      - provider: openrouter
+        model: anthropic/claude-sonnet-4-20250514
+        timeout: 120
     health_endpoints:
-      gateway: "http://127.0.0.1:8645"
+      gateway: http://127.0.0.1:8645
     auto_restart: true
     known_issues:
-      - host_and_vps_unknown_to_fleet
-      - config_needs_runtime_refresh
-
+      - host_and_vps_unknown_to_fleet
+      - config_needs_runtime_refresh
   ezra:
     role: archivist-and-interpreter wizard
     host: UNKNOWN
@@ -65,16 +58,15 @@ fleet:
       provider: anthropic
       model: claude-sonnet-4-20250514
     fallback_chain:
-      - provider: anthropic
-        model: claude-sonnet-4-20250514
-        timeout: 120
-      - provider: openrouter
-        model: anthropic/claude-sonnet-4-20250514
-        timeout: 120
+      - provider: anthropic
+        model: claude-sonnet-4-20250514
+        timeout: 120
+      - provider: openrouter
+        model: anthropic/claude-sonnet-4-20250514
+        timeout: 120
     auto_restart: true
     known_issues:
-      - timeout_choking_on_long_operations
-
+      - timeout_choking_on_long_operations
   timmy:
     role: sovereign core
     host: UNKNOWN
@@ -83,69 +75,63 @@ fleet:
       provider: anthropic
       model: claude-sonnet-4-20250514
     fallback_chain:
-      - provider: anthropic
-        model: claude-sonnet-4-20250514
-        timeout: 120
-      - provider: openrouter
-        model: anthropic/claude-sonnet-4-20250514
-        timeout: 120
+      - provider: anthropic
+        model: claude-sonnet-4-20250514
+        timeout: 120
+      - provider: openrouter
+        model: anthropic/claude-sonnet-4-20250514
+        timeout: 120
     auto_restart: true
-
 provider_health_matrix:
   kimi-coding:
-    status: degraded
-    note: "kimi-for-coding returns 403 access-terminated; use kimi-k2.5 model only"
-    last_checked: "2026-04-07T02:55:00Z"
+    status: healthy
+    note: ''
+    last_checked: '2026-04-07T15:09:53.384900+00:00'
     rate_limited: false
     dead: false
-
   anthropic:
     status: healthy
-    last_checked: "2026-04-07T02:55:00Z"
+    last_checked: '2026-04-07T15:09:53.385047+00:00'
     rate_limited: false
     dead: false
-
+    note: ''
   openrouter:
     status: healthy
-    last_checked: "2026-04-07T02:55:00Z"
+    last_checked: '2026-04-07T02:55:00Z'
     rate_limited: false
     dead: false
-
-  big_brain:
-    status: provisioning
-    note: "RunPod L40S instance big-brain-bezalel deployed; Ollama endpoint propagating"
-    last_checked: "2026-04-07T02:55:00Z"
-    endpoint: "http://yxw29g3excyddq-64411cd0-11434.tcp.runpod.net:11434/v1"
+  ollama:
+    status: healthy
+    note: Local Ollama endpoint with Gemma 4 support
+    last_checked: '2026-04-07T15:09:53.385047+00:00'
+    endpoint: http://localhost:11434/v1
     rate_limited: false
     dead: false
-
 timeout_policies:
   gateway:
     inactivity_timeout_seconds: 600
     diagnostic_on_timeout: true
   cron:
-    inactivity_timeout_seconds: 0 # unlimited while active
+    inactivity_timeout_seconds: 0
   agent:
     default_turn_timeout: 120
     long_operation_heartbeat: true
-
 watchdog:
   enabled: true
   interval_seconds: 60
   actions:
-    - ping_agent_gateways
-    - probe_providers
-    - parse_agent_logs
-    - update_registry
-    - auto_promote_fallbacks
-    - auto_restart_dead_agents
-
+    - ping_agent_gateways
+    - probe_providers
+    - parse_agent_logs
+    - update_registry
+    - auto_promote_fallbacks
+    - auto_restart_dead_agents
 resurrection_protocol:
   soft:
-    - reload_config_from_registry
-    - rewrite_fallback_providers
-    - promote_first_healthy_fallback
+    - reload_config_from_registry
+    - rewrite_fallback_providers
+    - promote_first_healthy_fallback
   hard:
-    - systemctl_restart_gateway
-    - log_incident
-    - notify_sovereign
+    - systemctl_restart_gateway
+    - log_incident
+    - notify_sovereign
```
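The scripts in this change treat the registry as the single source of truth, so a cheap consistency check is worth having: every provider a fleet entry references should exist in `provider_health_matrix`. A sketch using `pyyaml` (which this change adds via `requirements.txt`):

```python
# Sketch: verify fleet provider references against the health matrix.
import yaml

with open("lazarus-registry.yaml") as f:
    registry = yaml.safe_load(f)

matrix = registry.get("provider_health_matrix", {})
for name, spec in registry.get("fleet", {}).items():
    referenced = [spec.get("primary", {}).get("provider")]
    referenced += [fb.get("provider") for fb in spec.get("fallback_chain", [])]
    for provider in filter(None, referenced):
        if provider not in matrix:
            print(f"{name}: provider '{provider}' missing from provider_health_matrix")
```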
nexus/components/fleet-health-dashboard.html (new file, 118 lines)
```html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Fleet Health Dashboard — Lazarus Pit</title>
<style>
  body { font-family: system-ui, sans-serif; background: #0b0c10; color: #c5c6c7; margin: 0; padding: 2rem; }
  h1 { color: #66fcf1; margin-bottom: 0.5rem; }
  .subtitle { color: #45a29e; margin-bottom: 2rem; }
  .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 1rem; }
  .card { background: #1f2833; border-radius: 8px; padding: 1rem; border-left: 4px solid #66fcf1; }
  .card.dead { border-left-color: #ff4444; }
  .card.warning { border-left-color: #ffaa00; }
  .card.unknown { border-left-color: #888; }
  .name { font-size: 1.2rem; font-weight: bold; color: #fff; }
  .status { font-size: 0.9rem; margin-top: 0.5rem; }
  .metric { display: flex; justify-content: space-between; margin-top: 0.3rem; font-size: 0.85rem; }
  .timestamp { color: #888; font-size: 0.75rem; margin-top: 0.8rem; }
  #alerts { margin-top: 2rem; background: #1f2833; padding: 1rem; border-radius: 8px; }
  .alert { color: #ff4444; font-size: 0.9rem; margin: 0.3rem 0; }
</style>
</head>
<body>
<h1>⚡ Fleet Health Dashboard</h1>
<div class="subtitle">Powered by the Lazarus Pit — Live Registry</div>
<div class="grid" id="fleetGrid"></div>
<div id="alerts"></div>

<script>
const REGISTRY_URL = "https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/raw/branch/main/lazarus-registry.yaml";

async function fetchRegistry() {
  try {
    const res = await fetch(REGISTRY_URL);
    const text = await res.text();
    // Very lightweight YAML parser for the subset we need
    const data = parseSimpleYaml(text);
    render(data);
  } catch (e) {
    document.getElementById("fleetGrid").innerHTML = `<div class="card dead">Failed to load registry: ${e.message}</div>`;
  }
}

function parseSimpleYaml(text) {
  // Enough to extract fleet blocks and the provider matrix. Scalar fields of
  // nested blocks (primary, health_endpoints) are flattened onto their entry.
  const lines = text.split("\n");
  const obj = { fleet: {}, provider_health_matrix: {} };
  let section = null;
  let entry = null; // current agent or provider name
  lines.forEach(line => {
    const trimmed = line.trim();
    const indent = line.search(/\S/); // index of first non-space char, -1 for blanks
    if (trimmed === "fleet:") { section = "fleet"; entry = null; return; }
    if (trimmed === "provider_health_matrix:") { section = "providers"; entry = null; return; }
    if (indent === 0 && trimmed.endsWith(":")) { section = null; return; } // other top-level block
    if (!section) return;
    const target = section === "fleet" ? obj.fleet : obj.provider_health_matrix;
    // A two-space-indented "name:" line (key with no value) opens a new block.
    if (indent === 2 && trimmed.endsWith(":") && !trimmed.includes(": ")) {
      entry = trimmed.slice(0, -1);
      target[entry] = {};
      return;
    }
    if (entry && !trimmed.startsWith("-") && trimmed.includes(": ")) {
      const [k, ...v] = trimmed.split(": ");
      target[entry][k.trim()] = v.join(": ").trim();
    }
  });
  return obj;
}

function render(data) {
  const grid = document.getElementById("fleetGrid");
  const alerts = document.getElementById("alerts");
  grid.innerHTML = "";
  alerts.innerHTML = "";

  const fleet = data.fleet || {};
  const providers = data.provider_health_matrix || {};
  let alertHtml = "";

  Object.entries(fleet).forEach(([name, spec]) => {
    // The parser flattens primary.provider onto the agent itself.
    const provName = spec.provider || "unknown";
    const provStatus = (providers[provName] || {}).status || "unknown";
    const host = spec.host || "unknown";
    const autoRestart = spec.auto_restart === "true" || spec.auto_restart === true;

    let cardClass = "card";
    if (provStatus === "dead" || provStatus === "degraded") cardClass += " warning";
    if (host === "UNKNOWN") cardClass += " unknown";

    const html = `
      <div class="${cardClass}">
        <div class="name">${name}</div>
        <div class="status">Role: ${spec.role || "—"}</div>
        <div class="metric"><span>Host</span><span>${host}</span></div>
        <div class="metric"><span>Provider</span><span>${provName}</span></div>
        <div class="metric"><span>Provider Health</span><span style="color:${provStatus==='healthy'?'#66fcf1':provStatus==='degraded'?'#ffaa00':'#ff4444'}">${provStatus}</span></div>
        <div class="metric"><span>Auto-Restart</span><span>${autoRestart ? "ON" : "OFF"}</span></div>
        <div class="timestamp">Registry updated: ${data.meta ? data.meta.updated_at : "—"}</div>
      </div>
    `;
    grid.innerHTML += html;

    if (provStatus === "dead") alertHtml += `<div class="alert">🚨 ${name}: primary provider ${provName} is DEAD</div>`;
    if (host === "UNKNOWN") alertHtml += `<div class="alert">⚠️ ${name}: host unknown — cannot monitor or resurrect</div>`;
  });

  alerts.innerHTML = alertHtml || `<div style="color:#66fcf1">All agents within known parameters.</div>`;
}

fetchRegistry();
setInterval(fetchRegistry, 60000);
</script>
</body>
</html>
```
nexus/components/fleet-pulse.html (new file, 101 lines)
```html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Fleet Pulse — Collective Stability</title>
<style>
  body { margin: 0; background: #050505; overflow: hidden; display: flex; align-items: center; justify-content: center; height: 100vh; }
  #pulseCanvas { display: block; }
  #info {
    position: absolute; bottom: 20px; left: 50%; transform: translateX(-50%);
    color: #66fcf1; font-family: system-ui, sans-serif; font-size: 14px; opacity: 0.8;
    text-align: center;
  }
</style>
</head>
<body>
<canvas id="pulseCanvas"></canvas>
<div id="info">Fleet Pulse — Lazarus Pit Registry</div>
<script>
const canvas = document.getElementById('pulseCanvas');
const ctx = canvas.getContext('2d');
let width, height, centerX, centerY;

function resize() {
  width = canvas.width = window.innerWidth;
  height = canvas.height = window.innerHeight;
  centerX = width / 2;
  centerY = height / 2;
}
window.addEventListener('resize', resize);
resize();

let syncLevel = 0.5;
let targetSync = 0.5;

async function fetchRegistry() {
  try {
    const res = await fetch('https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/raw/branch/main/lazarus-registry.yaml');
    const text = await res.text();
    const healthy = (text.match(/status: healthy/g) || []).length;
    const degraded = (text.match(/status: degraded/g) || []).length;
    const dead = (text.match(/status: dead/g) || []).length;
    const total = healthy + degraded + dead + 1;
    targetSync = Math.max(0.1, Math.min(1.0, (healthy + 0.5 * degraded) / total));
  } catch (e) {
    targetSync = 0.2;
  }
}

fetchRegistry();
setInterval(fetchRegistry, 30000);

let time = 0;
function draw() {
  time += 0.02;
  syncLevel += (targetSync - syncLevel) * 0.02;

  ctx.fillStyle = 'rgba(5, 5, 5, 0.2)';
  ctx.fillRect(0, 0, width, height);

  const baseRadius = 60 + syncLevel * 80;
  const pulseSpeed = 0.5 + syncLevel * 1.5;
  const colorHue = syncLevel > 0.7 ? 170 : syncLevel > 0.4 ? 45 : 0;

  for (let i = 0; i < 5; i++) {
    const offset = i * 1.2;
    const radius = baseRadius + Math.sin(time * pulseSpeed + offset) * (20 + syncLevel * 40);
    const alpha = 0.6 - i * 0.1;

    ctx.beginPath();
    ctx.arc(centerX, centerY, Math.abs(radius), 0, Math.PI * 2);
    ctx.strokeStyle = `hsla(${colorHue}, 80%, 60%, ${alpha})`;
    ctx.lineWidth = 3 + syncLevel * 4;
    ctx.stroke();
  }

  // Orbiting agents
  const agents = 5;
  for (let i = 0; i < agents; i++) {
    const angle = time * 0.3 * (i % 2 === 0 ? 1 : -1) + (i * Math.PI * 2 / agents);
    const orbitR = baseRadius + 80 + i * 25;
    const x = centerX + Math.cos(angle) * orbitR;
    const y = centerY + Math.sin(angle) * orbitR;

    ctx.beginPath();
    ctx.arc(x, y, 4 + syncLevel * 4, 0, Math.PI * 2);
    ctx.fillStyle = `hsl(${colorHue}, 80%, 70%)`;
    ctx.fill();
  }

  ctx.fillStyle = '#fff';
  ctx.font = '16px system-ui';
  ctx.textAlign = 'center';
  ctx.fillText(`Collective Stability: ${Math.round(syncLevel * 100)}%`, centerX, centerY + 8);

  requestAnimationFrame(draw);
}
draw();
</script>
</body>
</html>
```
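The pulse page derives its stability figure from raw `status:` counts in the fetched YAML. The same metric computed offline against the registry file (sketch; mirrors the page's formula, including its `+ 1` divisor):

```python
# Sketch: compute the pulse page's "collective stability" from the registry.
import yaml

with open("lazarus-registry.yaml") as f:
    matrix = yaml.safe_load(f).get("provider_health_matrix", {})

counts = {"healthy": 0, "degraded": 0, "dead": 0}
for spec in matrix.values():
    status = spec.get("status")
    if status in counts:
        counts[status] += 1

total = sum(counts.values()) + 1  # matches the page's divisor
sync = max(0.1, min(1.0, (counts["healthy"] + 0.5 * counts["degraded"]) / total))
print(f"Collective stability: {round(sync * 100)}%")
```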
requirements.txt (new file, 3 lines)

```
pytest>=7.0
pytest-asyncio>=0.21.0
pyyaml>=6.0
```
scripts/lazarus_checkpoint.py (new file, 140 lines)
```python
#!/usr/bin/env python3
"""
Lazarus Checkpoint / Restore
============================
Save and resume mission cell state for agent resurrection.

Usage:
    python scripts/lazarus_checkpoint.py <mission_name>
    python scripts/lazarus_checkpoint.py --restore <mission_name>
    python scripts/lazarus_checkpoint.py --list
"""

import sys
import argparse
import json
import tarfile
import subprocess
from datetime import datetime, timezone
from pathlib import Path

CHECKPOINT_DIR = Path("/var/lib/lazarus/checkpoints")
MISSION_DIRS = {
    "bezalel": "/root/wizards/bezalel",
    "the-nexus": "/root/wizards/bezalel/workspace/the-nexus",
    "hermes-agent": "/root/wizards/bezalel/workspace/hermes-agent",
}


def shell(cmd: str, timeout: int = 60) -> tuple[int, str, str]:
    try:
        r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
        return r.returncode, r.stdout.strip(), r.stderr.strip()
    except Exception as e:
        return -1, "", str(e)


def checkpoint(mission: str) -> Path:
    src = Path(MISSION_DIRS.get(mission, mission))
    if not src.exists():
        print(f"ERROR: Source directory not found: {src}")
        sys.exit(1)

    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    out_dir = CHECKPOINT_DIR / mission
    out_dir.mkdir(parents=True, exist_ok=True)
    tar_path = out_dir / f"{mission}_{ts}.tar.gz"

    # Git commit checkpoint
    git_sha = ""
    git_path = src / ".git"
    if git_path.exists():
        code, out, _ = shell(f"cd {src} && git rev-parse HEAD")
        if code == 0:
            git_sha = out

    meta = {
        "mission": mission,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "source": str(src),
        "git_sha": git_sha,
    }
    meta_path = out_dir / f"{mission}_{ts}.json"
    with open(meta_path, "w") as f:
        json.dump(meta, f, indent=2)

    # Tar.gz checkpoint (respect .gitignore if possible)
    with tarfile.open(tar_path, "w:gz") as tar:
        tar.add(src, arcname=src.name)

    print(f"CHECKPOINT {mission}: {tar_path}")
    print(f"  Meta: {meta_path}")
    print(f"  Git SHA: {git_sha or 'n/a'}")
    return tar_path


def restore(mission: str, identifier: str | None = None):
    out_dir = CHECKPOINT_DIR / mission
    if not out_dir.exists():
        print(f"ERROR: No checkpoints found for {mission}")
        sys.exit(1)

    tars = sorted(out_dir.glob("*.tar.gz"))
    if not tars:
        print(f"ERROR: No tar.gz checkpoints for {mission}")
        sys.exit(1)

    if identifier:
        tar_path = out_dir / f"{mission}_{identifier}.tar.gz"
        if not tar_path.exists():
            print(f"ERROR: Checkpoint not found: {tar_path}")
            sys.exit(1)
    else:
        tar_path = tars[-1]

    src = Path(MISSION_DIRS.get(mission, mission))
    print(f"RESTORE {mission}: {tar_path} → {src}")
    with tarfile.open(tar_path, "r:gz") as tar:
        tar.extractall(path=src.parent)
    print("Restore complete. Restart agent to resume from checkpoint.")


def list_checkpoints():
    if not CHECKPOINT_DIR.exists():
        print("No checkpoints stored.")
        return
    for mission_dir in sorted(CHECKPOINT_DIR.iterdir()):
        if mission_dir.is_dir():
            tars = sorted(mission_dir.glob("*.tar.gz"))
            print(f"{mission_dir.name}: {len(tars)} checkpoint(s)")
            for t in tars[-5:]:
                print(f"  {t.name}")


def main() -> int:
    parser = argparse.ArgumentParser(description="Lazarus Checkpoint / Restore")
    parser.add_argument("mission", nargs="?", help="Mission name to checkpoint/restore")
    parser.add_argument("--restore", action="store_true", help="Restore mode")
    parser.add_argument("--identifier", help="Specific checkpoint identifier (YYYYMMDD_HHMMSS)")
    parser.add_argument("--list", action="store_true", help="List all checkpoints")
    args = parser.parse_args()

    if args.list:
        list_checkpoints()
        return 0

    if not args.mission:
        print("ERROR: mission name required (or use --list)")
        return 1

    if args.restore:
        restore(args.mission, args.identifier)
    else:
        checkpoint(args.mission)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
```
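A typical resurrection flow would snapshot a mission before a hard restart. Programmatically that is a single call into the module above (sketch; assumes `scripts/` is importable from the repo root):

```python
# Sketch: snapshot the-nexus before a hard restart.
import sys

sys.path.insert(0, "scripts")  # assumption: run from the repo root
from lazarus_checkpoint import checkpoint

tar_path = checkpoint("the-nexus")  # mission key from MISSION_DIRS
print(f"Snapshot stored at {tar_path}")
```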
scripts/lazarus_watchdog.py (new file, 252 lines)
```python
#!/usr/bin/env python3
"""
Lazarus Pit Watchdog
====================
Automated health monitoring, fallback promotion, and agent resurrection
for the Timmy Foundation wizard fleet.

Usage:
    python lazarus_watchdog.py [--dry-run]
"""

import json
import argparse
import subprocess
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

import yaml

REGISTRY_PATH = Path("/root/wizards/bezalel/workspace/the-nexus/lazarus-registry.yaml")
INCIDENT_LOG = Path("/var/log/lazarus_incidents.jsonl")
AGENT_CONFIG_PATH = Path("/root/wizards/bezalel/home/.hermes/config.yaml")


def shell(cmd: str, timeout: int = 30) -> tuple[int, str, str]:
    try:
        r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
        return r.returncode, r.stdout.strip(), r.stderr.strip()
    except Exception as e:
        return -1, "", str(e)


def load_registry() -> dict:
    with open(REGISTRY_PATH) as f:
        return yaml.safe_load(f)


def save_registry(data: dict):
    with open(REGISTRY_PATH, "w") as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False)


def ping_http(url: str, timeout: int = 10) -> tuple[bool, int]:
    try:
        req = urllib.request.Request(url, method="HEAD")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return True, resp.status
    except urllib.error.HTTPError as e:
        return True, e.code  # server answered; any HTTP status counts as reachable
    except Exception:
        return False, 0


def probe_provider(provider: str, model: str, timeout: int = 20) -> dict:
    """
    Lightweight provider probe. Simplified: assume healthy unless recent
    gateway logs show explicit evidence of death or degradation.
    """
    # Keywords are lowercase because they are matched against lowercased log text.
    dead_keywords = ["access_terminated", "403", "invalid api key"]
    degraded_keywords = ["rate limit", "429", "timeout", "connection reset"]

    status = "healthy"
    note = ""

    # Parse last 100 lines of the hermes gateway log if available
    hermes_log = Path("/var/log/hermes-gateway.log")
    if hermes_log.exists():
        _, out, _ = shell(f"tail -n 100 {hermes_log}")
        lower = out.lower()
        for kw in dead_keywords:
            if kw in lower:
                status = "dead"
                note = f"Detected '{kw}' in recent gateway logs"
                break
        if status == "healthy":
            for kw in degraded_keywords:
                if kw in lower:
                    status = "degraded"
                    note = f"Detected '{kw}' in recent gateway logs"
                    break

    return {"status": status, "note": note, "last_checked": datetime.now(timezone.utc).isoformat()}


def check_agent(name: str, spec: dict) -> dict:
    result = {"agent": name, "timestamp": datetime.now(timezone.utc).isoformat(), "actions": []}

    # Ping gateway
    gw_url = spec.get("health_endpoints", {}).get("gateway")
    if gw_url:
        reachable, code = ping_http(gw_url)
        result["gateway_reachable"] = reachable
        result["gateway_status"] = code
        if not reachable:
            result["actions"].append("gateway_unreachable")
    else:
        result["gateway_reachable"] = False
        result["actions"].append("no_gateway_configured")

    # Local service check (only if on this host)
    host = spec.get("host", "")
    if host in ("127.0.0.1", "localhost", "104.131.15.18") or not host:
        svc_name = f"hermes-{name}.service"
        code, out, _ = shell(f"systemctl is-active {svc_name}")
        result["service_active"] = (code == 0)
        if code != 0:
            result["actions"].append("service_inactive")
    else:
        result["service_active"] = None

    # Probe primary provider
    primary = spec.get("primary", {})
    probe = probe_provider(primary.get("provider"), primary.get("model"))
    result["primary_provider"] = probe
    if probe["status"] in ("dead", "degraded"):
        result["actions"].append(f"primary_{probe['status']}")

    return result


def rewrite_fallbacks(name: str, fallback_chain: list, dry_run: bool = False) -> bool:
    """Rewrite Bezalel's local config.yaml fallback_providers to match registry."""
    if name != "bezalel":
        return False  # Can only rewrite local config
    if not AGENT_CONFIG_PATH.exists():
        return False

    with open(AGENT_CONFIG_PATH) as f:
        config = yaml.safe_load(f)

    if "fallback_providers" not in config:
        config["fallback_providers"] = []

    new_fallbacks = []
    for entry in fallback_chain:
        fb = {
            "provider": entry["provider"],
            "model": entry["model"],
            "timeout": entry.get("timeout", 120),
        }
        if entry.get("provider") == "openrouter":
            fb["base_url"] = "https://openrouter.ai/api/v1"
            fb["api_key_env"] = "OPENROUTER_API_KEY"
        if entry.get("provider") == "big_brain":
            fb["base_url"] = "http://yxw29g3excyddq-64411cd0-11434.tcp.runpod.net:11434/v1"
        new_fallbacks.append(fb)

    if config["fallback_providers"] == new_fallbacks:
        return False  # No change needed

    config["fallback_providers"] = new_fallbacks

    if not dry_run:
        with open(AGENT_CONFIG_PATH, "w") as f:
            yaml.dump(config, f, default_flow_style=False, sort_keys=False)

    return True


def resurrect_agent(name: str, dry_run: bool = False) -> bool:
    svc = f"hermes-{name}.service"
    if dry_run:
        print(f"[DRY-RUN] Would restart {svc}")
        return True
    code, _, _ = shell(f"systemctl restart {svc}")
    return code == 0


def log_incident(event: dict):
    INCIDENT_LOG.parent.mkdir(parents=True, exist_ok=True)
    with open(INCIDENT_LOG, "a") as f:
        f.write(json.dumps(event) + "\n")


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true", help="Show actions without executing")
    args = parser.parse_args()

    registry = load_registry()
    fleet = registry.get("fleet", {})
    provider_matrix = registry.get("provider_health_matrix", {})
    changed = False

    for name, spec in fleet.items():
        result = check_agent(name, spec)
        actions = result.get("actions", [])

        # Update provider matrix
        primary_provider = spec.get("primary", {}).get("provider")
        if primary_provider and primary_provider in provider_matrix:
            provider_matrix[primary_provider].update(result["primary_provider"])

        # Rewrite fallback chain if needed (local only)
        if name == "bezalel":
            fb_chain = spec.get("fallback_chain", [])
            if rewrite_fallbacks(name, fb_chain, dry_run=args.dry_run):
                result["actions"].append("fallback_chain_rewritten")
                changed = True

        # Resurrection logic — only for local agents
        agent_host = spec.get("host", "")
        is_local = agent_host in ("127.0.0.1", "localhost", "104.131.15.18") or not agent_host
        if is_local and ("gateway_unreachable" in actions or "service_inactive" in actions):
            if spec.get("auto_restart", False):
                ok = resurrect_agent(name, dry_run=args.dry_run)
                result["resurrected"] = ok
                result["actions"].append("auto_restart_executed" if ok else "auto_restart_failed")
                log_incident(result)
                changed = True

        # Fallback promotion if primary is dead
        if "primary_dead" in actions:
            fb = spec.get("fallback_chain", [])
            if fb:
                healthy_fallback = None
                for candidate in fb:
                    cand_provider = candidate["provider"]
                    if provider_matrix.get(cand_provider, {}).get("status") == "healthy":
                        healthy_fallback = candidate
                        break
                if healthy_fallback:
                    if not args.dry_run:
                        spec["primary"] = healthy_fallback
                    result["actions"].append(f"promoted_fallback_to_{healthy_fallback['provider']}")
                    log_incident(result)
                    changed = True

        # Print summary
        status = "OK" if not actions else "ACTION"
        print(f"[{status}] {name}: {', '.join(actions) if actions else 'healthy'}")

    if changed and not args.dry_run:
        registry["meta"]["updated_at"] = datetime.now(timezone.utc).isoformat()
        save_registry(registry)
        print("\nRegistry updated.")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
```
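The registry's `watchdog.interval_seconds: 60` implies periodic execution, but this change ships no cron entry or systemd timer. A minimal in-process scheduler sketch (the deployment mechanism is an assumption):

```python
# Sketch: run the watchdog on the registry's 60-second interval.
import subprocess
import time

while True:
    subprocess.run(["python3", "scripts/lazarus_watchdog.py"], check=False)
    time.sleep(60)
```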