Compare commits

...

6 Commits

Author SHA1 Message Date
34862cf5e5 feat(fleet): promote Ollama to first-class provider, assign Gemma 4 across fleet
Some checks failed
Deploy Nexus / deploy (push) Failing after 3s
Staging Verification Gate / verify-staging (push) Failing after 3s
- lazarus-registry.yaml: replace big_brain/RunPod with local ollama/gemma4:12b
- fleet-routing.json: assign ollama:gemma4:12b to carnice, bilbobagginshire, substratum
- intelligence/deepdive/config.yaml: local model -> gemma4:12b
2026-04-07 15:55:52 +00:00
5275c96e52 Merge PR #1110: MemPalace retention enforcement + tunnel sync client
Some checks failed
Deploy Nexus / deploy (push) Failing after 3s
Staging Verification Gate / verify-staging (push) Failing after 2s
2026-04-07 15:19:40 +00:00
36e1db9ae1 fix(ci): repair bash syntax in validate job and add missing requirements.txt
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Staging Verification Gate / verify-staging (push) Has been cancelled
CI / test (pull_request) Failing after 16s
CI / validate (pull_request) Failing after 12s
Review Approval Gate / verify-review (pull_request) Failing after 4s
- Fix empty 'then' block in Python syntax validation loop
- Add minimal requirements.txt for pytest/pytest-asyncio/pyyaml
2026-04-07 15:16:19 +00:00
259df5b5e6 feat(lazarus): fleet health dashboard, pulse viz, and checkpoint/restore (#805 #869 #881)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Staging Verification Gate / verify-staging (push) Has been cancelled
2026-04-07 15:14:03 +00:00
30fe98d569 chore(lazarus): update registry after first watchdog run
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Staging Verification Gate / verify-staging (push) Has been cancelled
2026-04-07 15:10:44 +00:00
b0654bac6c feat(lazarus): deploy fleet health watchdog with auto-restart and fallback promotion (#911) 2026-04-07 15:10:44 +00:00
9 changed files with 688 additions and 86 deletions

View File

@@ -41,9 +41,11 @@ jobs:
run: |
FAIL=0
for f in $(find . -name '*.py' -not -path './venv/*'); do
if ! python3 -c "import py_compile; py_compile.compile('$f', doraise=True)" 2>/dev/null; then
else
if python3 -c "import py_compile; py_compile.compile('$f', doraise=True)" 2>/dev/null; then
echo "OK: $f"
else
echo "FAIL: $f"
FAIL=1
fi
done
exit $FAIL

View File

@@ -9,7 +9,7 @@
"id": 27,
"name": "carnice",
"gitea_user": "carnice",
"model": "qwen3.5-9b",
"model": "ollama:gemma4:12b",
"tier": "free",
"location": "Local Metal",
"description": "Local Hermes agent, fine-tuned on Hermes traces. Runs on local hardware.",
@@ -41,7 +41,7 @@
"id": 25,
"name": "bilbobagginshire",
"gitea_user": "bilbobagginshire",
"model": "ollama",
"model": "ollama:gemma4:12b",
"tier": "free",
"location": "Bag End, The Shire (VPS)",
"description": "Ollama on VPS. Speaks when spoken to. Prefers quiet. Not for delegated work.",
@@ -74,7 +74,7 @@
"id": 23,
"name": "substratum",
"gitea_user": "substratum",
"model": "unassigned",
"model": "ollama:gemma4:12b",
"tier": "unknown",
"location": "Below the Surface",
"description": "Infrastructure, deployments, bedrock services. Needs model assignment before activation.",

View File

@@ -76,7 +76,7 @@ deepdive:
# Phase 3: Synthesis
synthesis:
llm_endpoint: "http://localhost:4000/v1" # Local llama-server
llm_model: "gemma-4-it"
llm_model: "gemma4:12b"
max_summary_length: 800
temperature: 0.7

View File

@@ -1,12 +1,7 @@
# Lazarus Pit Registry — Single Source of Truth for Fleet Health and Resurrection
# Version: 1.0.0
# Owner: Bezalel (deployment), Ezra (compilation), Allegro (validation)
meta:
version: "1.0.0"
updated_at: "2026-04-07T02:55:00Z"
next_review: "2026-04-14T02:55:00Z"
version: 1.0.0
updated_at: '2026-04-07T15:09:53.386648+00:00'
next_review: '2026-04-14T02:55:00Z'
fleet:
bezalel:
role: forge-and-testbed wizard
@@ -16,23 +11,22 @@ fleet:
provider: kimi-coding
model: kimi-k2.5
fallback_chain:
- provider: kimi-coding
model: kimi-k2.5
timeout: 120
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
- provider: big_brain
model: gemma3:27b-instruct-q8_0
timeout: 300
- provider: kimi-coding
model: kimi-k2.5
timeout: 120
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
- provider: ollama
model: gemma4:12b
timeout: 300
health_endpoints:
gateway: "http://127.0.0.1:8646"
api_server: "http://127.0.0.1:8656"
gateway: http://127.0.0.1:8646
api_server: http://127.0.0.1:8656
auto_restart: true
allegro:
role: code-craft wizard
host: UNKNOWN
@@ -41,22 +35,21 @@ fleet:
provider: kimi-coding
model: kimi-k2.5
fallback_chain:
- provider: kimi-coding
model: kimi-k2.5
timeout: 120
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
- provider: kimi-coding
model: kimi-k2.5
timeout: 120
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
health_endpoints:
gateway: "http://127.0.0.1:8645"
gateway: http://127.0.0.1:8645
auto_restart: true
known_issues:
- host_and_vps_unknown_to_fleet
- config_needs_runtime_refresh
- host_and_vps_unknown_to_fleet
- config_needs_runtime_refresh
ezra:
role: archivist-and-interpreter wizard
host: UNKNOWN
@@ -65,16 +58,15 @@ fleet:
provider: anthropic
model: claude-sonnet-4-20250514
fallback_chain:
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
auto_restart: true
known_issues:
- timeout_choking_on_long_operations
- timeout_choking_on_long_operations
timmy:
role: sovereign core
host: UNKNOWN
@@ -83,69 +75,63 @@ fleet:
provider: anthropic
model: claude-sonnet-4-20250514
fallback_chain:
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
- provider: anthropic
model: claude-sonnet-4-20250514
timeout: 120
- provider: openrouter
model: anthropic/claude-sonnet-4-20250514
timeout: 120
auto_restart: true
provider_health_matrix:
kimi-coding:
status: degraded
note: "kimi-for-coding returns 403 access-terminated; use kimi-k2.5 model only"
last_checked: "2026-04-07T02:55:00Z"
status: healthy
note: ''
last_checked: '2026-04-07T15:09:53.384900+00:00'
rate_limited: false
dead: false
anthropic:
status: healthy
last_checked: "2026-04-07T02:55:00Z"
last_checked: '2026-04-07T15:09:53.385047+00:00'
rate_limited: false
dead: false
note: ''
openrouter:
status: healthy
last_checked: "2026-04-07T02:55:00Z"
last_checked: '2026-04-07T02:55:00Z'
rate_limited: false
dead: false
big_brain:
status: provisioning
note: "RunPod L40S instance big-brain-bezalel deployed; Ollama endpoint propagating"
last_checked: "2026-04-07T02:55:00Z"
endpoint: "http://yxw29g3excyddq-64411cd0-11434.tcp.runpod.net:11434/v1"
ollama:
status: healthy
note: Local Ollama endpoint with Gemma 4 support
last_checked: '2026-04-07T15:09:53.385047+00:00'
endpoint: http://localhost:11434/v1
rate_limited: false
dead: false
timeout_policies:
gateway:
inactivity_timeout_seconds: 600
diagnostic_on_timeout: true
cron:
inactivity_timeout_seconds: 0 # unlimited while active
inactivity_timeout_seconds: 0
agent:
default_turn_timeout: 120
long_operation_heartbeat: true
watchdog:
enabled: true
interval_seconds: 60
actions:
- ping_agent_gateways
- probe_providers
- parse_agent_logs
- update_registry
- auto_promote_fallbacks
- auto_restart_dead_agents
- ping_agent_gateways
- probe_providers
- parse_agent_logs
- update_registry
- auto_promote_fallbacks
- auto_restart_dead_agents
resurrection_protocol:
soft:
- reload_config_from_registry
- rewrite_fallback_providers
- promote_first_healthy_fallback
- reload_config_from_registry
- rewrite_fallback_providers
- promote_first_healthy_fallback
hard:
- systemctl_restart_gateway
- log_incident
- notify_sovereign
- systemctl_restart_gateway
- log_incident
- notify_sovereign

View File

@@ -0,0 +1,118 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Fleet Health Dashboard — Lazarus Pit</title>
<style>
body { font-family: system-ui, sans-serif; background: #0b0c10; color: #c5c6c7; margin: 0; padding: 2rem; }
h1 { color: #66fcf1; margin-bottom: 0.5rem; }
.subtitle { color: #45a29e; margin-bottom: 2rem; }
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 1rem; }
.card { background: #1f2833; border-radius: 8px; padding: 1rem; border-left: 4px solid #66fcf1; }
.card.dead { border-left-color: #ff4444; }
.card.warning { border-left-color: #ffaa00; }
.card.unknown { border-left-color: #888; }
.name { font-size: 1.2rem; font-weight: bold; color: #fff; }
.status { font-size: 0.9rem; margin-top: 0.5rem; }
.metric { display: flex; justify-content: space-between; margin-top: 0.3rem; font-size: 0.85rem; }
.timestamp { color: #888; font-size: 0.75rem; margin-top: 0.8rem; }
#alerts { margin-top: 2rem; background: #1f2833; padding: 1rem; border-radius: 8px; }
.alert { color: #ff4444; font-size: 0.9rem; margin: 0.3rem 0; }
</style>
</head>
<body>
<h1>⚡ Fleet Health Dashboard</h1>
<div class="subtitle">Powered by the Lazarus Pit — Live Registry</div>
<div class="grid" id="fleetGrid"></div>
<div id="alerts"></div>
<script>
const REGISTRY_URL = "https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/raw/branch/main/lazarus-registry.yaml";
/**
 * Download the registry YAML, parse it, and redraw the dashboard.
 *
 * Any failure (network error, non-2xx response, parse/render exception)
 * replaces the grid with a single red error card instead of throwing.
 */
async function fetchRegistry() {
  try {
    const res = await fetch(REGISTRY_URL);
    // fetch() only rejects on network failure; a 404/500 still resolves,
    // and its HTML body would otherwise be fed to the YAML parser.
    if (!res.ok) {
      throw new Error(`HTTP ${res.status} ${res.statusText}`.trim());
    }
    const text = await res.text();
    // Very lightweight YAML parser for the subset we need
    const data = parseSimpleYaml(text);
    render(data);
  } catch (e) {
    // Build the card with textContent so the error text is never
    // interpreted as markup.
    const card = document.createElement("div");
    card.className = "card dead";
    card.textContent = `Failed to load registry: ${e.message}`;
    const grid = document.getElementById("fleetGrid");
    grid.innerHTML = "";
    grid.appendChild(card);
  }
}
/**
 * Parse the small subset of the registry YAML that the dashboard needs.
 *
 * Returns `{ meta, fleet, provider_health_matrix }` where
 *   - `meta` maps key -> scalar string,
 *   - `fleet` maps agent name -> { field: string | nested-map },
 *   - `provider_health_matrix` maps provider name -> { field: string }.
 *
 * This is NOT a general YAML parser. It relies on indentation to tell
 * entry names, fields and nested maps apart, so it assumes block-style
 * YAML with consistent indentation (hand-written or PyYAML output).
 * Sequence items (`- ...`) and everything nested under them are skipped;
 * all scalar values are returned as strings with surrounding quotes removed.
 *
 * @param {string} text Raw YAML document.
 * @returns {{meta: Object, fleet: Object, provider_health_matrix: Object}}
 */
function parseSimpleYaml(text) {
  const obj = { meta: {}, fleet: {}, provider_health_matrix: {} };

  // Map a top-level "key:" line to the section we care about (or null).
  const sectionFor = (trimmed) => {
    switch (trimmed) {
      case "meta:": return "meta";
      case "fleet:": return "fleet";
      case "provider_health_matrix:": return "providers";
      default: return null;
    }
  };

  // Drop one pair of matching surrounding quotes, if present.
  const unquote = (raw) => {
    const v = raw.trim();
    const q = v[0];
    if (v.length >= 2 && (q === '"' || q === "'") && v[v.length - 1] === q) {
      return v.slice(1, -1);
    }
    return v;
  };

  // Split "key: value" -> [key, value]; "key:" -> [key, null]; else null.
  // Only the FIRST colon separates key from value, so URLs and timestamps
  // survive intact.
  const splitPair = (trimmed) => {
    const idx = trimmed.indexOf(":");
    if (idx === -1) return null;
    const rest = trimmed.slice(idx + 1);
    if (rest !== "" && !rest.startsWith(" ")) return null;
    const value = rest.trim();
    return [trimmed.slice(0, idx).trim(), value === "" ? null : unquote(value)];
  };

  let section = null;     // "meta" | "fleet" | "providers" | null
  let entry = null;       // current agent / provider name
  let nested = null;      // current nested mapping key under the entry
  let inList = false;     // currently inside a YAML sequence (skipped)
  let nameIndent = null;  // indentation of entry names in this section
  let fieldIndent = null; // indentation of fields under the current entry

  text.split(/\r?\n/).forEach((line) => {
    const trimmed = line.trim();
    if (trimmed === "" || trimmed.startsWith("#")) return;
    const indent = line.length - line.trimStart().length;

    // A top-level key always starts (or ends) a section.
    if (indent === 0) {
      section = sectionFor(trimmed);
      entry = null;
      nested = null;
      inList = false;
      nameIndent = null;
      fieldIndent = null;
      return;
    }
    if (section === null) return;

    // Sequence items carry fallback chains / issue lists we do not display.
    if (trimmed === "-" || trimmed.startsWith("- ")) {
      inList = true;
      return;
    }

    if (nameIndent === null) nameIndent = indent;
    const pair = splitPair(trimmed);

    if (indent <= nameIndent) {
      inList = false;
      nested = null;
      fieldIndent = null;
      if (section === "meta") {
        if (pair && pair[1] !== null) obj.meta[pair[0]] = pair[1];
        return;
      }
      // fleet / providers: a bare "name:" line opens a new entry.
      if (pair && pair[1] === null) {
        entry = pair[0];
        const container = section === "fleet" ? obj.fleet : obj.provider_health_matrix;
        container[entry] = {};
      } else {
        entry = null;
      }
      return;
    }

    if (section === "meta" || entry === null) return;
    const target = section === "fleet" ? obj.fleet[entry] : obj.provider_health_matrix[entry];

    if (fieldIndent === null) fieldIndent = indent;

    if (indent <= fieldIndent) {
      // Direct field of the agent / provider.
      inList = false;
      if (!pair) { nested = null; return; }
      if (pair[1] === null) {
        nested = pair[0];
        target[nested] = {};
      } else {
        nested = null;
        target[pair[0]] = pair[1];
      }
      return;
    }

    // Deeper line: belongs to a nested map unless we are inside a sequence.
    if (inList || nested === null || !pair || pair[1] === null) return;
    target[nested][pair[0]] = pair[1];
  });

  return obj;
}
/**
 * Render one card per fleet agent plus the alert list.
 *
 * @param {{fleet?: Object, provider_health_matrix?: Object, meta?: Object}} data
 *   Parsed registry. `fleet[name].primary` may be a nested object with a
 *   `provider` field; a flat `fleet[name].provider` is used as a fallback.
 */
function render(data) {
  const grid = document.getElementById("fleetGrid");
  const alerts = document.getElementById("alerts");

  // Registry text is remote content: escape it before it reaches innerHTML.
  const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const esc = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  const fleet = data.fleet || {};
  const providers = data.provider_health_matrix || {};
  const updatedAt = (data.meta && data.meta.updated_at) || "—";
  const STATUS_COLORS = { healthy: "#66fcf1", degraded: "#ffaa00" };

  const cards = [];
  const alertItems = [];

  Object.entries(fleet).forEach(([name, spec]) => {
    // Use the primary mapping directly; the previous JSON round-trip with
    // quote replacement could throw on values containing an apostrophe.
    const primary = spec.primary && typeof spec.primary === "object" ? spec.primary : {};
    const provName = primary.provider || spec.provider || "unknown";
    const provStatus = (providers[provName] || {}).status || "unknown";
    const host = spec.host || "unknown";
    const autoRestart = spec.auto_restart === "true" || spec.auto_restart === true;

    let cardClass = "card";
    // The stylesheet defines a dedicated red `.card.dead` variant.
    if (provStatus === "dead") cardClass += " dead";
    else if (provStatus === "degraded") cardClass += " warning";
    if (host === "UNKNOWN") cardClass += " unknown";

    const statusColor = STATUS_COLORS[provStatus] || "#ff4444";

    cards.push(`
<div class="${cardClass}">
<div class="name">${esc(name)}</div>
<div class="status">Role: ${esc(spec.role || "—")}</div>
<div class="metric"><span>Host</span><span>${esc(host)}</span></div>
<div class="metric"><span>Provider</span><span>${esc(provName)}</span></div>
<div class="metric"><span>Provider Health</span><span style="color:${statusColor}">${esc(provStatus)}</span></div>
<div class="metric"><span>Auto-Restart</span><span>${autoRestart ? "ON" : "OFF"}</span></div>
<div class="timestamp">Registry updated: ${esc(updatedAt)}</div>
</div>
`);

    if (provStatus === "dead") {
      alertItems.push(`<div class="alert">🚨 ${esc(name)}: primary provider ${esc(provName)} is DEAD</div>`);
    }
    if (host === "UNKNOWN") {
      alertItems.push(`<div class="alert">⚠️ ${esc(name)}: host unknown — cannot monitor or resurrect</div>`);
    }
  });

  // Assign once: `innerHTML +=` inside the loop re-parses the whole grid
  // for every agent.
  grid.innerHTML = cards.join("");
  alerts.innerHTML = alertItems.join("") || `<div style="color:#66fcf1">All agents within known parameters.</div>`;
}
fetchRegistry();
setInterval(fetchRegistry, 60000);
</script>
</body>
</html>

View File

@@ -0,0 +1,101 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Fleet Pulse — Collective Stability</title>
<style>
body { margin: 0; background: #050505; overflow: hidden; display: flex; align-items: center; justify-content: center; height: 100vh; }
#pulseCanvas { display: block; }
#info {
position: absolute; bottom: 20px; left: 50%; transform: translateX(-50%);
color: #66fcf1; font-family: system-ui, sans-serif; font-size: 14px; opacity: 0.8;
text-align: center;
}
</style>
</head>
<body>
<canvas id="pulseCanvas"></canvas>
<div id="info">Fleet Pulse — Lazarus Pit Registry</div>
<script>
const canvas = document.getElementById('pulseCanvas');
const ctx = canvas.getContext('2d');
let width, height, centerX, centerY;
/**
 * Size the canvas to the viewport and refresh the cached dimensions
 * and centre point used by the draw loop.
 */
function resize() {
  const { innerWidth: viewportW, innerHeight: viewportH } = window;
  canvas.width = viewportW;
  canvas.height = viewportH;
  width = viewportW;
  height = viewportH;
  centerX = width / 2;
  centerY = height / 2;
}
window.addEventListener('resize', resize);
resize();
let syncLevel = 0.5;
let targetSync = 0.5;
/**
 * Refresh `targetSync` from the registry.
 *
 * Counts `status: healthy|degraded|dead` occurrences in the raw YAML and
 * sets `targetSync` to (healthy + 0.5 * degraded) / total, clamped to
 * [0.1, 1.0]. If the registry cannot be fetched, or reports no statuses
 * at all, `targetSync` falls back to 0.2.
 */
async function fetchRegistry() {
  try {
    const res = await fetch('https://forge.alexanderwhitestone.com/Timmy_Foundation/the-nexus/raw/branch/main/lazarus-registry.yaml');
    // A 404/500 page resolves normally; do not score an error page.
    if (!res.ok) {
      throw new Error(`HTTP ${res.status}`);
    }
    const text = await res.text();
    const healthy = (text.match(/status: healthy/g) || []).length;
    const degraded = (text.match(/status: degraded/g) || []).length;
    const dead = (text.match(/status: dead/g) || []).length;
    const total = healthy + degraded + dead;
    if (total === 0) {
      // Nothing to score: treat like an unreachable registry rather than
      // dividing by zero.
      targetSync = 0.2;
      return;
    }
    // No "+ 1" in the denominator: it kept an all-healthy fleet below 100%.
    targetSync = Math.max(0.1, Math.min(1.0, (healthy + 0.5 * degraded) / total));
  } catch (e) {
    targetSync = 0.2;
  }
}
fetchRegistry();
setInterval(fetchRegistry, 30000);
let time = 0;
/**
 * Render one animation frame and schedule the next.
 *
 * Eases `syncLevel` toward `targetSync`, fades the previous frame, then
 * draws five pulsing rings, five orbiting dots and the stability label.
 */
function draw() {
  time += 0.02;
  syncLevel += (targetSync - syncLevel) * 0.02;

  // Translucent fill leaves a fading trail of earlier frames.
  ctx.fillStyle = 'rgba(5, 5, 5, 0.2)';
  ctx.fillRect(0, 0, width, height);

  const baseRadius = 60 + syncLevel * 80;
  const pulseSpeed = 0.5 + syncLevel * 1.5;

  let colorHue;
  if (syncLevel > 0.7) {
    colorHue = 170;
  } else if (syncLevel > 0.4) {
    colorHue = 45;
  } else {
    colorHue = 0;
  }

  // Concentric pulse rings, each phase-shifted and slightly fainter.
  let ring = 0;
  while (ring < 5) {
    const phase = ring * 1.2;
    const ringRadius = baseRadius + Math.sin(time * pulseSpeed + phase) * (20 + syncLevel * 40);
    const opacity = 0.6 - ring * 0.1;
    ctx.beginPath();
    ctx.arc(centerX, centerY, Math.abs(ringRadius), 0, Math.PI * 2);
    ctx.strokeStyle = `hsla(${colorHue}, 80%, 60%, ${opacity})`;
    ctx.lineWidth = 3 + syncLevel * 4;
    ctx.stroke();
    ring += 1;
  }

  // Orbiting agents
  const agents = 5;
  for (let k = 0; k < agents; k += 1) {
    const direction = k % 2 === 0 ? 1 : -1;
    const theta = time * 0.3 * direction + (k * Math.PI * 2 / agents);
    const orbit = baseRadius + 80 + k * 25;
    const dotX = centerX + Math.cos(theta) * orbit;
    const dotY = centerY + Math.sin(theta) * orbit;
    ctx.beginPath();
    ctx.arc(dotX, dotY, 4 + syncLevel * 4, 0, Math.PI * 2);
    ctx.fillStyle = `hsl(${colorHue}, 80%, 70%)`;
    ctx.fill();
  }

  // Centre label
  ctx.fillStyle = '#fff';
  ctx.font = '16px system-ui';
  ctx.textAlign = 'center';
  const percent = Math.round(syncLevel * 100);
  ctx.fillText(`Collective Stability: ${percent}%`, centerX, centerY + 8);

  requestAnimationFrame(draw);
}
draw();
</script>
</body>
</html>

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
pytest>=7.0
pytest-asyncio>=0.21.0
pyyaml>=6.0

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Lazarus Checkpoint / Restore
============================
Save and resume mission cell state for agent resurrection.
Usage:
python scripts/lazarus_checkpoint.py <mission_name>
python scripts/lazarus_checkpoint.py --restore <mission_name>
python scripts/lazarus_checkpoint.py --list
"""
import os
import sys
import argparse
import json
import tarfile
import subprocess
from datetime import datetime, timezone
from pathlib import Path
# Root directory for stored checkpoints; checkpoint() and restore() use one
# subdirectory per mission beneath it.
CHECKPOINT_DIR = Path("/var/lib/lazarus/checkpoints")
# Known mission names mapped to the directory that gets archived. A mission
# name that is not listed here is used as the source path itself.
MISSION_DIRS = {
    "bezalel": "/root/wizards/bezalel",
    "the-nexus": "/root/wizards/bezalel/workspace/the-nexus",
    "hermes-agent": "/root/wizards/bezalel/workspace/hermes-agent",
}
def shell(cmd: str, timeout: int = 60) -> tuple[int, str, str]:
    """Run *cmd* through the system shell and capture its result.

    Returns ``(returncode, stdout, stderr)`` with both streams decoded as
    text and stripped. Any exception raised while running the command
    (including a timeout) is reported as ``(-1, "", str(exc))`` instead of
    propagating.
    """
    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except Exception as exc:
        return -1, "", str(exc)
    else:
        stdout_text = completed.stdout.strip()
        stderr_text = completed.stderr.strip()
        return completed.returncode, stdout_text, stderr_text
def checkpoint(mission: str) -> Path:
    """Archive a mission directory and record metadata about the snapshot.

    *mission* is looked up in ``MISSION_DIRS``; an unknown name is used as
    the source path itself. A ``<mission>_<UTC timestamp>.tar.gz`` archive
    and a matching ``.json`` metadata file (mission, creation time, source
    path, git HEAD if the source is a git checkout) are written under
    ``CHECKPOINT_DIR / mission``.

    Returns the path of the archive. Prints an error and exits with status 1
    if the source directory does not exist.
    """
    src = Path(MISSION_DIRS.get(mission, mission))
    if not src.exists():
        print(f"ERROR: Source directory not found: {src}")
        sys.exit(1)

    # One clock reading, so the file-name timestamp and the recorded
    # creation time can never disagree.
    now = datetime.now(timezone.utc)
    ts = now.strftime("%Y%m%d_%H%M%S")
    out_dir = CHECKPOINT_DIR / mission
    out_dir.mkdir(parents=True, exist_ok=True)
    tar_path = out_dir / f"{mission}_{ts}.tar.gz"
    meta_path = out_dir / f"{mission}_{ts}.json"

    # Git commit checkpoint. The path is passed as an argv element (no
    # shell), so spaces or metacharacters in it cannot alter the command.
    git_sha = ""
    if (src / ".git").exists():
        try:
            proc = subprocess.run(
                ["git", "-C", str(src), "rev-parse", "HEAD"],
                capture_output=True,
                text=True,
                timeout=60,
            )
        except (OSError, subprocess.SubprocessError):
            proc = None  # git missing or hung: the SHA is optional
        if proc is not None and proc.returncode == 0:
            git_sha = proc.stdout.strip()

    # Tar.gz checkpoint. NOTE: the whole tree is archived; .gitignore is
    # not consulted.
    try:
        with tarfile.open(tar_path, "w:gz") as tar:
            tar.add(src, arcname=src.name)
    except BaseException:
        # A truncated archive would otherwise be picked up as the newest
        # checkpoint by restore().
        tar_path.unlink(missing_ok=True)
        raise

    # Metadata is written only once the archive exists, so a failed
    # checkpoint leaves no orphaned .json behind.
    meta = {
        "mission": mission,
        "created_at": now.isoformat(),
        "source": str(src),
        "git_sha": git_sha,
    }
    with open(meta_path, "w") as f:
        json.dump(meta, f, indent=2)

    print(f"CHECKPOINT {mission}: {tar_path}")
    print(f" Meta: {meta_path}")
    print(f" Git SHA: {git_sha or 'n/a'}")
    return tar_path
def restore(mission: str, identifier: str | None = None):
    """Extract a stored checkpoint back over the mission's source directory.

    With *identifier* (``YYYYMMDD_HHMMSS``) that exact checkpoint is used;
    otherwise the checkpoint whose file name sorts last is used. The archive
    is unpacked into the parent of the mission directory, recreating the
    directory under its original name. Existing files are overwritten, but
    files absent from the archive are not removed.

    Prints an error and exits with status 1 if no matching checkpoint exists.
    """
    out_dir = CHECKPOINT_DIR / mission
    if not out_dir.exists():
        print(f"ERROR: No checkpoints found for {mission}")
        sys.exit(1)
    tars = sorted(out_dir.glob("*.tar.gz"))
    if not tars:
        print(f"ERROR: No tar.gz checkpoints for {mission}")
        sys.exit(1)
    if identifier:
        tar_path = out_dir / f"{mission}_{identifier}.tar.gz"
        if not tar_path.exists():
            print(f"ERROR: Checkpoint not found: {tar_path}")
            sys.exit(1)
    else:
        # Timestamped names sort chronologically, so the last is the newest.
        tar_path = tars[-1]
    src = Path(MISSION_DIRS.get(mission, mission))
    # The archive and destination were previously printed fused together.
    print(f"RESTORE {mission}: {tar_path} -> {src}")
    with tarfile.open(tar_path, "r:gz") as tar:
        if hasattr(tarfile, "tar_filter"):
            # The "tar" filter refuses members that would land outside the
            # destination and clears setuid/setgid bits. It is used instead
            # of "data" because workspaces may contain absolute symlinks
            # (virtualenvs, for example), which "data" rejects.
            tar.extractall(path=src.parent, filter="tar")
        else:
            # Older Python without extraction filters: unfiltered extract.
            tar.extractall(path=src.parent)
    print("Restore complete. Restart agent to resume from checkpoint.")
def list_checkpoints():
    """Print every mission with its checkpoint count and up to five of the
    last-sorted archive names."""
    if not CHECKPOINT_DIR.exists():
        print("No checkpoints stored.")
        return
    for entry in sorted(CHECKPOINT_DIR.iterdir()):
        if not entry.is_dir():
            continue
        archives = sorted(entry.glob("*.tar.gz"))
        count = len(archives)
        print(f"{entry.name}: {count} checkpoint(s)")
        recent = archives[-5:]
        for archive in recent:
            print(f" {archive.name}")
def main() -> int:
    """Command-line entry point.

    ``--list`` prints stored checkpoints. Otherwise a mission name is
    required: it is checkpointed by default, or restored with ``--restore``
    (optionally selecting a checkpoint with ``--identifier``).
    Returns the process exit status.
    """
    cli = argparse.ArgumentParser(description="Lazarus Checkpoint / Restore")
    cli.add_argument("mission", nargs="?", help="Mission name to checkpoint/restore")
    cli.add_argument("--restore", action="store_true", help="Restore mode")
    cli.add_argument("--identifier", help="Specific checkpoint identifier (YYYYMMDD_HHMMSS)")
    cli.add_argument("--list", action="store_true", help="List all checkpoints")
    opts = cli.parse_args()

    if opts.list:
        list_checkpoints()
        return 0

    mission_name = opts.mission
    if not mission_name:
        print("ERROR: mission name required (or use --list)")
        return 1

    if not opts.restore:
        checkpoint(mission_name)
    else:
        restore(mission_name, opts.identifier)
    return 0
if __name__ == "__main__":
raise SystemExit(main())

252
scripts/lazarus_watchdog.py Normal file
View File

@@ -0,0 +1,252 @@
#!/usr/bin/env python3
"""
Lazarus Pit Watchdog
====================
Automated health monitoring, fallback promotion, and agent resurrection
for the Timmy Foundation wizard fleet.
Usage:
python lazarus_watchdog.py [--dry-run]
"""
import os
import sys
import json
import argparse
import subprocess
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Fleet registry YAML, read by load_registry() and rewritten by save_registry().
REGISTRY_PATH = Path("/root/wizards/bezalel/workspace/the-nexus/lazarus-registry.yaml")
# Append-only incident log, one JSON object per line (see log_incident()).
INCIDENT_LOG = Path("/var/log/lazarus_incidents.jsonl")
# Agent config whose `fallback_providers` list rewrite_fallbacks() updates.
AGENT_CONFIG_PATH = Path("/root/wizards/bezalel/home/.hermes/config.yaml")
def shell(cmd: str, timeout: int = 30) -> tuple[int, str, str]:
    """Execute *cmd* via the shell; return ``(returncode, stdout, stderr)``.

    Both output streams are captured as stripped text. If running the
    command raises (a timeout, for instance) the result is
    ``(-1, "", <exception text>)`` instead of an exception.
    """
    run_options = {
        "shell": True,
        "capture_output": True,
        "text": True,
        "timeout": timeout,
    }
    try:
        proc = subprocess.run(cmd, **run_options)
        outcome = (proc.returncode, proc.stdout.strip(), proc.stderr.strip())
    except Exception as failure:
        outcome = (-1, "", str(failure))
    return outcome
def load_registry() -> dict:
    """Load the fleet registry from ``REGISTRY_PATH``.

    Returns the parsed YAML mapping. An empty file, for which
    ``yaml.safe_load`` returns ``None``, yields an empty dict so the result
    always matches the declared return type.
    """
    with open(REGISTRY_PATH) as f:
        return yaml.safe_load(f) or {}
def save_registry(data: dict):
    """Write *data* to ``REGISTRY_PATH`` as block-style YAML, atomically.

    The document is first dumped to a sibling ``.tmp`` file and then moved
    over the registry with ``os.replace``. If dumping fails, the existing
    registry is left untouched instead of being truncated in place.

    NOTE: ``yaml.dump`` does not preserve comments, so comments in the
    existing file are lost on the first save.
    """
    tmp_path = REGISTRY_PATH.with_name(REGISTRY_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w") as f:
            yaml.dump(data, f, default_flow_style=False, sort_keys=False)
        os.replace(tmp_path, REGISTRY_PATH)
    except BaseException:
        # Do not leave a half-written temp file next to the registry.
        tmp_path.unlink(missing_ok=True)
        raise
def ping_http(url: str, timeout: int = 10) -> tuple[bool, int]:
    """Send a HEAD request to *url* and report ``(reachable, status_code)``.

    An HTTP error response still counts as reachable and yields
    ``(True, <error code>)``. Any other failure, including a malformed URL,
    yields ``(False, 0)``.
    """
    try:
        head_request = urllib.request.Request(url, method="HEAD")
        response = urllib.request.urlopen(head_request, timeout=timeout)
    except urllib.error.HTTPError as http_error:
        # The server answered, just not with a success status.
        return True, http_error.code
    except Exception:
        return False, 0
    else:
        with response:
            return True, response.status
def probe_provider(
    provider: str,
    model: str,
    timeout: int = 20,
    log_path: Path = Path("/var/log/hermes-gateway.log"),
) -> dict:
    """Lightweight provider probe based on recent gateway log output.

    No API call is made. The last 100 lines of *log_path* are scanned
    case-insensitively for failure keywords; the provider is assumed
    healthy unless one is found, or if the log is missing or unreadable.

    Args:
        provider: Provider name. Currently unused: the log scan is not
            filtered per provider, so a hit applies to whichever provider
            is being probed.
        model: Model name. Currently unused.
        timeout: Currently unused; kept for interface compatibility.
        log_path: Gateway log to scan.

    Returns:
        ``{"status": "healthy" | "degraded" | "dead", "note": str,
        "last_checked": <UTC ISO-8601 timestamp>}``.
    """
    from collections import deque  # local import: only needed here

    # Keywords must be lower-case because they are matched against
    # lower-cased log text. "Invalid API key" and "Connection reset" were
    # previously mixed-case and could never match.
    dead_keywords = ("access_terminated", "403", "invalid api key")
    degraded_keywords = ("rate limit", "429", "timeout", "connection reset")

    # Read the tail in-process instead of shelling out to `tail`.
    try:
        with open(log_path, errors="replace") as fh:
            recent = "".join(deque(fh, maxlen=100)).lower()
    except OSError:
        recent = ""  # missing/unreadable log: no evidence of failure

    status = "healthy"
    note = ""
    # "dead" keywords take precedence over "degraded" ones.
    for verdict, keywords in (("dead", dead_keywords), ("degraded", degraded_keywords)):
        hit = next((kw for kw in keywords if kw in recent), None)
        if hit is not None:
            status = verdict
            note = f"Detected '{hit}' in recent gateway logs"
            break

    return {"status": status, "note": note, "last_checked": datetime.now(timezone.utc).isoformat()}
def check_agent(name: str, spec: dict) -> dict:
    """Collect health observations for one agent.

    Pings the configured gateway, checks the ``hermes-<name>.service``
    systemd unit when the agent's host is local (or unset), and probes the
    primary provider. Returns a report dict whose ``"actions"`` list names
    every problem found.
    """
    report = {"agent": name, "timestamp": datetime.now(timezone.utc).isoformat(), "actions": []}
    findings = report["actions"]

    # Gateway reachability
    gateway = spec.get("health_endpoints", {}).get("gateway")
    if not gateway:
        report["gateway_reachable"] = False
        findings.append("no_gateway_configured")
    else:
        is_up, http_code = ping_http(gateway)
        report["gateway_reachable"] = is_up
        report["gateway_status"] = http_code
        if not is_up:
            findings.append("gateway_unreachable")

    # systemd unit state, only meaningful for agents on this machine
    host = spec.get("host", "")
    runs_here = not host or host in ("127.0.0.1", "localhost", "104.131.15.18")
    if runs_here:
        exit_code, _, _ = shell(f"systemctl is-active hermes-{name}.service")
        report["service_active"] = exit_code == 0
        if exit_code != 0:
            findings.append("service_inactive")
    else:
        report["service_active"] = None

    # Primary provider health
    primary = spec.get("primary", {})
    provider_state = probe_provider(primary.get("provider"), primary.get("model"))
    report["primary_provider"] = provider_state
    if provider_state["status"] in ("dead", "degraded"):
        findings.append(f"primary_{provider_state['status']}")

    return report
def rewrite_fallbacks(name: str, fallback_chain: list, dry_run: bool = False) -> bool:
    """Rewrite Bezalel's local config.yaml fallback_providers to match registry.

    Args:
        name: Agent name; only ``"bezalel"`` has a local config to rewrite.
        fallback_chain: Registry entries, each with ``provider``, ``model``
            and optionally ``timeout`` (defaults to 120).
        dry_run: When true, report whether a change is needed but do not
            write the config file.

    Returns:
        True if the config differs from the registry (and was rewritten
        unless *dry_run*); False if nothing changed, the agent is not local,
        or the config file does not exist.
    """
    if name != "bezalel":
        return False  # Can only rewrite local config
    if not AGENT_CONFIG_PATH.exists():
        return False
    with open(AGENT_CONFIG_PATH) as f:
        # An empty config file parses to None; treat it as an empty mapping.
        config = yaml.safe_load(f) or {}

    # Extra connection settings per provider.
    provider_extras = {
        "openrouter": {
            "base_url": "https://openrouter.ai/api/v1",
            "api_key_env": "OPENROUTER_API_KEY",
        },
        # Retired RunPod endpoint, kept so older registries still resolve.
        "big_brain": {
            "base_url": "http://yxw29g3excyddq-64411cd0-11434.tcp.runpod.net:11434/v1",
        },
        # NOTE(review): mirrors the `ollama` endpoint listed in the registry's
        # provider_health_matrix; confirm the agent config expects an
        # explicit base_url for this provider.
        "ollama": {
            "base_url": "http://localhost:11434/v1",
        },
    }

    new_fallbacks = []
    for entry in fallback_chain:
        fb = {
            "provider": entry["provider"],
            "model": entry["model"],
            "timeout": entry.get("timeout", 120),
        }
        fb.update(provider_extras.get(entry["provider"], {}))
        new_fallbacks.append(fb)

    if config.get("fallback_providers", []) == new_fallbacks:
        return False  # No change needed
    config["fallback_providers"] = new_fallbacks
    if not dry_run:
        with open(AGENT_CONFIG_PATH, "w") as f:
            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
    return True
def resurrect_agent(name: str, dry_run: bool = False) -> bool:
    """Restart the ``hermes-<name>.service`` systemd unit.

    In dry-run mode the restart is only announced and True is returned.
    Otherwise returns True when ``systemctl restart`` exits with status 0.
    """
    unit = f"hermes-{name}.service"
    if not dry_run:
        exit_code, _, _ = shell(f"systemctl restart {unit}")
        return exit_code == 0
    print(f"[DRY-RUN] Would restart {unit}")
    return True
def log_incident(event: dict):
    """Append *event* to the incident log as a single JSON line, creating
    the log's parent directory if needed."""
    log_dir = INCIDENT_LOG.parent
    log_dir.mkdir(parents=True, exist_ok=True)
    with open(INCIDENT_LOG, "a") as sink:
        line = json.dumps(event)
        sink.write(f"{line}\n")
def main() -> int:
    """Run one watchdog pass over every agent in the registry.

    For each agent: collect health observations, refresh the provider
    health matrix entry for its primary provider, sync Bezalel's local
    fallback config, restart local agents whose gateway or service is down
    (when ``auto_restart`` is set), and promote the first healthy fallback
    when the primary provider is dead. The registry is saved only if
    something changed.

    With ``--dry-run`` nothing is executed or written: no restarts, no
    config or registry writes, and no incident-log entries.

    Returns the process exit status (always 0).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true", help="Show actions without executing")
    args = parser.parse_args()

    # `or {}` guards against an empty registry file parsing to None.
    registry = load_registry() or {}
    fleet = registry.get("fleet", {})
    provider_matrix = registry.get("provider_health_matrix", {})
    changed = False

    for name, spec in fleet.items():
        result = check_agent(name, spec)
        # Same list object as result["actions"]: later appends show up in
        # both the summary line and the logged incident.
        actions = result.get("actions", [])

        # Update provider matrix
        primary_provider = spec.get("primary", {}).get("provider")
        if primary_provider and primary_provider in provider_matrix:
            provider_matrix[primary_provider].update(result["primary_provider"])

        # Rewrite fallback chain if needed (local only)
        if name == "bezalel":
            fb_chain = spec.get("fallback_chain", [])
            if rewrite_fallbacks(name, fb_chain, dry_run=args.dry_run):
                result["actions"].append("fallback_chain_rewritten")
                changed = True

        # Resurrection logic — only for local agents
        agent_host = spec.get("host", "")
        is_local = agent_host in ("127.0.0.1", "localhost", "104.131.15.18") or not agent_host
        if is_local and ("gateway_unreachable" in actions or "service_inactive" in actions):
            if spec.get("auto_restart", False):
                ok = resurrect_agent(name, dry_run=args.dry_run)
                result["resurrected"] = ok
                result["actions"].append("auto_restart_executed" if ok else "auto_restart_failed")
                # A dry run must not record an incident that never happened.
                if not args.dry_run:
                    log_incident(result)
                changed = True

        # Fallback promotion if primary is dead
        if "primary_dead" in actions:
            fb = spec.get("fallback_chain", [])
            healthy_fallback = next(
                (
                    candidate
                    for candidate in fb
                    if provider_matrix.get(candidate["provider"], {}).get("status") == "healthy"
                ),
                None,
            )
            if healthy_fallback:
                if not args.dry_run:
                    # Copy the entry: sharing one dict between `primary` and
                    # `fallback_chain` makes yaml.dump emit an anchor/alias
                    # pair in the saved registry.
                    spec["primary"] = dict(healthy_fallback)
                result["actions"].append(f"promoted_fallback_to_{healthy_fallback['provider']}")
                if not args.dry_run:
                    log_incident(result)
                changed = True

        # Print summary
        status = "OK" if not actions else "ACTION"
        print(f"[{status}] {name}: {', '.join(actions) if actions else 'healthy'}")

    if changed and not args.dry_run:
        # setdefault: a registry without a `meta` block must not crash the save.
        registry.setdefault("meta", {})["updated_at"] = datetime.now(timezone.utc).isoformat()
        save_registry(registry)
        print("\nRegistry updated.")
    return 0
if __name__ == "__main__":
raise SystemExit(main())