#!/usr/bin/env python3
"""
Lazarus Pit Watchdog
====================
Automated health monitoring, fallback promotion, and agent resurrection
for the Timmy Foundation wizard fleet.

Usage:
    python lazarus_watchdog.py [--dry-run]
"""

import os
import sys
import json
import argparse
import subprocess
import urllib.error
import urllib.request
from collections import deque
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Sequence, Union

import yaml

REGISTRY_PATH = Path("/root/wizards/bezalel/workspace/the-nexus/lazarus-registry.yaml")
INCIDENT_LOG = Path("/var/log/lazarus_incidents.jsonl")
AGENT_CONFIG_PATH = Path("/root/wizards/bezalel/home/.hermes/config.yaml")

# Gateway log scanned by probe_provider, and how many trailing lines to scan.
GATEWAY_LOG = Path("/var/log/hermes-gateway.log")
GATEWAY_LOG_TAIL_LINES = 100

# Host values that check_agent/main treat as "this machine".
LOCAL_HOSTS = ("127.0.0.1", "localhost", "104.131.15.18")

# Substrings searched for (case-insensitively) in the gateway log tail.
DEAD_KEYWORDS = ("access_terminated", "403", "Invalid API key")
DEGRADED_KEYWORDS = ("rate limit", "429", "timeout", "Connection reset")

# Extra config keys added to a fallback entry for specific providers.
PROVIDER_OVERRIDES = {
    "openrouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "api_key_env": "OPENROUTER_API_KEY",
    },
    "big_brain": {
        "base_url": "http://yxw29g3excyddq-64411cd0-11434.tcp.runpod.net:11434/v1",
    },
}


def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _is_local_host(host: Optional[str]) -> bool:
    """Return True when *host* is empty/missing or listed in LOCAL_HOSTS."""
    return not host or host in LOCAL_HOSTS


def shell(cmd: Union[str, Sequence[str]], timeout: int = 30) -> tuple[int, str, str]:
    """Run a command and return ``(returncode, stdout, stderr)``.

    Args:
        cmd: A shell command string (run through the shell, as before) or an
            argv sequence (run directly, without a shell, so arguments are
            never re-interpreted).
        timeout: Seconds to wait before giving up.

    Returns:
        The exit code with stripped stdout/stderr. If the command cannot be
        started or times out, ``(-1, "", <error text>)`` is returned instead
        of raising.
    """
    try:
        r = subprocess.run(
            cmd,
            shell=isinstance(cmd, str),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        return -1, "", str(e)
    return r.returncode, r.stdout.strip(), r.stderr.strip()


def load_registry() -> dict:
    """Load and return the registry YAML at REGISTRY_PATH."""
    with open(REGISTRY_PATH, encoding="utf-8") as f:
        return yaml.safe_load(f)


def save_registry(data: dict):
    """Write *data* back to REGISTRY_PATH as block-style YAML, keeping key order."""
    with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False)


def ping_http(url: str, timeout: int = 10) -> tuple[bool, int]:
    """Send a HEAD request to *url* and report reachability.

    Returns:
        ``(True, status)`` when the server answered, including HTTP error
        statuses (an error response still proves the endpoint is up);
        ``(False, 0)`` when the request could not be completed at all.
    """
    try:
        req = urllib.request.Request(url, method="HEAD")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return True, resp.status
    except urllib.error.HTTPError as e:
        return True, e.code
    except Exception:
        # Deliberate best-effort boundary: any failure (DNS, refused
        # connection, malformed URL, TLS error, ...) counts as unreachable.
        return False, 0


def _tail_text(path: Path, max_lines: int) -> str:
    """Return the last *max_lines* lines of *path*, or "" if it is unreadable."""
    try:
        with open(path, encoding="utf-8", errors="replace") as f:
            # deque(maxlen=...) streams the file and keeps only the tail.
            return "".join(deque(f, maxlen=max_lines))
    except OSError:
        return ""


def _classify_log_text(text: str) -> tuple[str, str]:
    """Map log text to ``(status, note)`` using the dead/degraded keyword lists."""
    haystack = text.lower()
    # "dead" is checked first so it wins over "degraded" when both appear.
    for status, keywords in (("dead", DEAD_KEYWORDS), ("degraded", DEGRADED_KEYWORDS)):
        for kw in keywords:
            # Lower-case the keyword too: the haystack is lower-cased, so a
            # mixed-case keyword such as "Invalid API key" would never match.
            if kw.lower() in haystack:
                return status, f"Detected '{kw}' in recent gateway logs"
    return "healthy", ""


def probe_provider(
    provider: str,
    model: str,
    timeout: int = 20,
    log_path: Optional[Path] = None,
) -> dict:
    """Lightweight provider probe based on recent gateway log lines.

    No API call is made. The provider is assumed healthy unless the tail of
    the gateway log contains one of the dead/degraded keywords.

    Args:
        provider: Provider name (currently unused by the probe).
        model: Model name (currently unused by the probe).
        timeout: Reserved; currently unused.
        log_path: Log file to scan; defaults to GATEWAY_LOG.

    Returns:
        A dict with ``status`` ("healthy", "degraded" or "dead"), ``note``
        and ``last_checked`` (UTC ISO-8601 timestamp).
    """
    # NOTE(review): the scan is not provider-specific -- every provider probed
    # in the same run gets the same status from the shared log.
    source = GATEWAY_LOG if log_path is None else Path(log_path)
    status, note = _classify_log_text(_tail_text(source, GATEWAY_LOG_TAIL_LINES))
    return {"status": status, "note": note, "last_checked": _utc_now_iso()}


def check_agent(name: str, spec: dict) -> dict:
    """Run all health checks for one agent and collect the findings.

    Args:
        name: Agent name; also used to build ``hermes-<name>.service``.
        spec: The agent's registry entry.

    Returns:
        A dict with ``agent``, ``timestamp``, ``gateway_reachable``,
        ``gateway_status`` (only when a gateway URL is configured),
        ``service_active`` (None for non-local agents), ``primary_provider``
        and the list of triggered ``actions``.
    """
    result = {"agent": name, "timestamp": _utc_now_iso(), "actions": []}

    # Ping gateway
    gw_url = (spec.get("health_endpoints") or {}).get("gateway")
    if gw_url:
        reachable, status_code = ping_http(gw_url)
        result["gateway_reachable"] = reachable
        result["gateway_status"] = status_code
        if not reachable:
            result["actions"].append("gateway_unreachable")
    else:
        result["gateway_reachable"] = False
        result["actions"].append("no_gateway_configured")

    # Local service check (only if on this host)
    if _is_local_host(spec.get("host", "")):
        # argv form: the agent name is passed as data, never parsed by a shell.
        rc, _, _ = shell(["systemctl", "is-active", f"hermes-{name}.service"])
        result["service_active"] = rc == 0
        if rc != 0:
            result["actions"].append("service_inactive")
    else:
        result["service_active"] = None

    # Probe primary provider
    primary = spec.get("primary") or {}
    probe = probe_provider(primary.get("provider"), primary.get("model"))
    result["primary_provider"] = probe
    if probe["status"] in ("dead", "degraded"):
        result["actions"].append(f"primary_{probe['status']}")

    return result


def rewrite_fallbacks(name: str, fallback_chain: list, dry_run: bool = False) -> bool:
    """Rewrite Bezalel's local config.yaml fallback_providers to match registry.

    Args:
        name: Agent name; only "bezalel" has a config on this host.
        fallback_chain: Registry entries, each with ``provider`` and
            ``model`` and an optional ``timeout`` (default 120).
        dry_run: When True, compute the change but do not write the file.

    Returns:
        True if the config differs from the registry (and was rewritten
        unless *dry_run*), False if nothing needed to change or the config
        cannot be rewritten from here.
    """
    if name != "bezalel":
        return False  # Can only rewrite local config
    if not AGENT_CONFIG_PATH.exists():
        return False

    with open(AGENT_CONFIG_PATH, encoding="utf-8") as f:
        # An empty config file parses to None; treat it as an empty mapping.
        config = yaml.safe_load(f) or {}
    config.setdefault("fallback_providers", [])

    new_fallbacks = []
    for entry in fallback_chain or []:
        fb = {
            "provider": entry["provider"],
            "model": entry["model"],
            "timeout": entry.get("timeout", 120),
        }
        fb.update(PROVIDER_OVERRIDES.get(entry["provider"], {}))
        new_fallbacks.append(fb)

    if config["fallback_providers"] == new_fallbacks:
        return False  # No change needed

    config["fallback_providers"] = new_fallbacks
    if not dry_run:
        with open(AGENT_CONFIG_PATH, "w", encoding="utf-8") as f:
            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
    return True


def resurrect_agent(name: str, dry_run: bool = False) -> bool:
    """Restart ``hermes-<name>.service``; return True on success.

    In dry-run mode nothing is restarted: the intended action is printed and
    True is returned.
    """
    svc = f"hermes-{name}.service"
    if dry_run:
        print(f"[DRY-RUN] Would restart {svc}")
        return True
    rc, _, _ = shell(["systemctl", "restart", svc])
    return rc == 0


def log_incident(event: dict):
    """Append *event* as one JSON line to INCIDENT_LOG, creating its directory."""
    INCIDENT_LOG.parent.mkdir(parents=True, exist_ok=True)
    with open(INCIDENT_LOG, "a", encoding="utf-8") as f:
        f.write(json.dumps(event) + "\n")


def _first_healthy_fallback(fallback_chain: list, provider_matrix: dict) -> Optional[dict]:
    """Return the first fallback whose provider is marked healthy, else None."""
    for candidate in fallback_chain:
        entry = provider_matrix.get(candidate["provider"]) or {}
        if entry.get("status") == "healthy":
            return candidate
    return None


def main() -> int:
    """Check every agent in the registry and apply remediation.

    For each agent: refresh the provider health matrix, sync the local
    fallback config, restart local agents that opted into ``auto_restart``,
    and promote the first healthy fallback when the primary is dead. With
    ``--dry-run`` the actions are reported but no file is written and no
    service is restarted.

    Returns:
        Process exit code (always 0).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true", help="Show actions without executing")
    args = parser.parse_args()

    # An empty registry file parses to None; treat it as an empty fleet.
    registry = load_registry() or {}
    fleet = registry.get("fleet") or {}
    provider_matrix = registry.get("provider_health_matrix") or {}

    changed = False

    for name, spec in fleet.items():
        result = check_agent(name, spec)
        # Same list object as result["actions"], so later appends show up in
        # the printed summary as well.
        actions = result["actions"]

        # Update provider matrix
        primary_provider = (spec.get("primary") or {}).get("provider")
        if primary_provider and primary_provider in provider_matrix:
            provider_matrix[primary_provider].update(result["primary_provider"])

        # Rewrite fallback chain if needed (local only)
        if name == "bezalel":
            fb_chain = spec.get("fallback_chain") or []
            if rewrite_fallbacks(name, fb_chain, dry_run=args.dry_run):
                actions.append("fallback_chain_rewritten")
                changed = True

        # Resurrection logic -- only for local agents
        needs_restart = "gateway_unreachable" in actions or "service_inactive" in actions
        if (
            needs_restart
            and _is_local_host(spec.get("host", ""))
            and spec.get("auto_restart", False)
        ):
            ok = resurrect_agent(name, dry_run=args.dry_run)
            result["resurrected"] = ok
            actions.append("auto_restart_executed" if ok else "auto_restart_failed")
            # A dry run must not leave records of incidents that never happened.
            if not args.dry_run:
                log_incident(result)
            changed = True

        # Fallback promotion if primary is dead
        if "primary_dead" in actions:
            healthy_fallback = _first_healthy_fallback(
                spec.get("fallback_chain") or [], provider_matrix
            )
            if healthy_fallback:
                if not args.dry_run:
                    spec["primary"] = healthy_fallback
                actions.append(f"promoted_fallback_to_{healthy_fallback['provider']}")
                if not args.dry_run:
                    log_incident(result)
                changed = True

        # Print summary
        status = "OK" if not actions else "ACTION"
        print(f"[{status}] {name}: {', '.join(actions) if actions else 'healthy'}")

    if changed and not args.dry_run:
        # Create the meta section on demand instead of raising KeyError.
        meta = registry.get("meta")
        if not isinstance(meta, dict):
            meta = registry["meta"] = {}
        meta["updated_at"] = _utc_now_iso()
        save_registry(registry)
        print("\nRegistry updated.")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())