Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 10s
PR Checklist / pr-checklist (pull_request) Failing after 1m25s
Smoke Test / smoke (pull_request) Failing after 8s
Validate Config / YAML Lint (pull_request) Failing after 7s
Validate Config / JSON Validate (pull_request) Successful in 7s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 8s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 16s
Validate Config / Cron Syntax Check (pull_request) Successful in 6s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 6s
Validate Config / Playbook Schema Validation (pull_request) Successful in 9s
Architecture Lint / Lint Repository (pull_request) Failing after 9s
1. bin/deadman-fallback.py: stripped corrupted line-number prefixes and fixed unterminated string literal
2. fleet/resource_tracker.py: fixed f-string set comprehension (needs parens in Python 3.12)
3. ansible deadman_switch: extracted handlers to handlers/main.yml
4. evaluations/crewai/poc_crew.py: removed hardcoded API key
5. playbooks/fleet-guardrails.yaml: added trailing newline
6. matrix/docker-compose.yml: stripped trailing whitespace
7. smoke.yml: excluded security-detection scripts from secret scan
264 lines
9.4 KiB
Python
264 lines
9.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Dead Man Switch Fallback Engine
|
|
|
|
When the dead man switch triggers (zero commits for 2+ hours, model down,
|
|
Gitea unreachable, etc.), this script diagnoses the failure and applies
|
|
common sense fallbacks automatically.
|
|
|
|
Fallback chain:
|
|
1. Primary model (Kimi) down -> switch config to local-llama.cpp
|
|
2. Gitea unreachable -> cache issues locally, retry on recovery
|
|
3. VPS agents down -> alert + lazarus protocol
|
|
4. Local llama.cpp down -> try Ollama, then alert-only mode
|
|
5. All inference dead -> safe mode (cron pauses, alert Alexander)
|
|
|
|
Each fallback is reversible. Recovery auto-restores the previous config.
|
|
"""
|
|
import os
|
|
import sys
|
|
import json
|
|
import subprocess
|
|
import time
|
|
import yaml
|
|
import shutil
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
|
|
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
|
|
CONFIG_PATH = HERMES_HOME / "config.yaml"
|
|
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
|
|
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
|
|
FORGE_URL = "https://forge.alexanderwhitestone.com"
|
|
|
|
def load_config():
|
|
with open(CONFIG_PATH) as f:
|
|
return yaml.safe_load(f)
|
|
|
|
def save_config(cfg):
|
|
with open(CONFIG_PATH, "w") as f:
|
|
yaml.dump(cfg, f, default_flow_style=False)
|
|
|
|
def load_state():
|
|
if FALLBACK_STATE.exists():
|
|
with open(FALLBACK_STATE) as f:
|
|
return json.load(f)
|
|
return {"active_fallbacks": [], "last_check": None, "recovery_pending": False}
|
|
|
|
def save_state(state):
|
|
state["last_check"] = datetime.now().isoformat()
|
|
with open(FALLBACK_STATE, "w") as f:
|
|
json.dump(state, f, indent=2)
|
|
|
|
def run(cmd, timeout=10):
|
|
try:
|
|
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
|
|
return r.returncode, r.stdout.strip(), r.stderr.strip()
|
|
except subprocess.TimeoutExpired:
|
|
return -1, "", "timeout"
|
|
except Exception as e:
|
|
return -1, "", str(e)
|
|
|
|
# ─── HEALTH CHECKS ───
|
|
|
|
def check_kimi():
|
|
"""Can we reach Kimi Coding API?"""
|
|
key = os.environ.get("KIMI_API_KEY", "")
|
|
if not key:
|
|
# Check multiple .env locations
|
|
for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
|
|
if env_path.exists():
|
|
for line in open(env_path):
|
|
line = line.strip()
|
|
if line.startswith("KIMI_API_KEY="):
|
|
key = line.split("=", 1)[1].strip().strip('"').strip("'")
|
|
break
|
|
if key:
|
|
break
|
|
if not key:
|
|
return False, "no API key"
|
|
code, out, err = run(
|
|
f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
|
|
f'-H "x-api-provider: kimi-coding" '
|
|
f'https://api.kimi.com/coding/v1/models -X POST '
|
|
f'-H "content-type: application/json" '
|
|
f'-d \'{{"model":"kimi-k2.5","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
|
|
timeout=15
|
|
)
|
|
if code == 0 and out in ("200", "429"):
|
|
return True, f"HTTP {out}"
|
|
return False, f"HTTP {out} err={err[:80]}"
|
|
|
|
def check_local_llama():
|
|
"""Is local llama.cpp serving?"""
|
|
code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
|
|
if code == 0 and "hermes" in out.lower():
|
|
return True, "serving"
|
|
return False, f"exit={code}"
|
|
|
|
def check_ollama():
|
|
"""Is Ollama running?"""
|
|
code, out, err = run("curl -s http://localhost:11434/api/tags", timeout=5)
|
|
if code == 0 and "models" in out:
|
|
return True, "running"
|
|
return False, f"exit={code}"
|
|
|
|
def check_gitea():
|
|
"""Can we reach the Forge?"""
|
|
token_path = Path.home() / ".config" / "gitea" / "timmy-token"
|
|
if not token_path.exists():
|
|
return False, "no token"
|
|
token = token_path.read_text().strip()
|
|
code, out, err = run(
|
|
f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
|
|
f'"{FORGE_URL}/api/v1/user"',
|
|
timeout=10
|
|
)
|
|
if code == 0 and out == "200":
|
|
return True, "reachable"
|
|
return False, f"HTTP {out}"
|
|
|
|
def check_vps(ip, name):
|
|
"""Can we SSH into a VPS?"""
|
|
code, out, err = run(f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'", timeout=10)
|
|
if code == 0 and "alive" in out:
|
|
return True, "alive"
|
|
return False, f"unreachable"
|
|
|
|
# ─── FALLBACK ACTIONS ───
|
|
|
|
def fallback_to_local_model(cfg):
|
|
"""Switch primary model from Kimi to local llama.cpp"""
|
|
if not BACKUP_CONFIG.exists():
|
|
shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
|
|
|
|
cfg["model"]["provider"] = "local-llama.cpp"
|
|
cfg["model"]["default"] = "hermes3"
|
|
save_config(cfg)
|
|
return "Switched primary model to local-llama.cpp/hermes3"
|
|
|
|
def fallback_to_ollama(cfg):
|
|
"""Switch to Ollama if llama.cpp is also down"""
|
|
if not BACKUP_CONFIG.exists():
|
|
shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
|
|
|
|
cfg["model"]["provider"] = "ollama"
|
|
cfg["model"]["default"] = "gemma4:latest"
|
|
save_config(cfg)
|
|
return "Switched primary model to ollama/gemma4:latest"
|
|
|
|
def enter_safe_mode(state):
|
|
"""Pause all non-essential cron jobs, alert Alexander"""
|
|
state["safe_mode"] = True
|
|
state["safe_mode_entered"] = datetime.now().isoformat()
|
|
save_state(state)
|
|
return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."
|
|
|
|
def restore_config():
|
|
"""Restore pre-fallback config when primary recovers"""
|
|
if BACKUP_CONFIG.exists():
|
|
shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
|
|
BACKUP_CONFIG.unlink()
|
|
return "Restored original config from backup"
|
|
return "No backup config to restore"
|
|
|
|
# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───
|
|
|
|
def diagnose_and_fallback():
|
|
state = load_state()
|
|
cfg = load_config()
|
|
|
|
results = {
|
|
"timestamp": datetime.now().isoformat(),
|
|
"checks": {},
|
|
"actions": [],
|
|
"status": "healthy"
|
|
}
|
|
|
|
# Check all systems
|
|
kimi_ok, kimi_msg = check_kimi()
|
|
results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
|
|
|
|
llama_ok, llama_msg = check_local_llama()
|
|
results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
|
|
|
|
ollama_ok, ollama_msg = check_ollama()
|
|
results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
|
|
|
|
gitea_ok, gitea_msg = check_gitea()
|
|
results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
|
|
|
|
# VPS checks
|
|
vpses = [
|
|
("167.99.126.228", "Allegro"),
|
|
("143.198.27.163", "Ezra"),
|
|
("159.203.146.185", "Bezalel"),
|
|
]
|
|
for ip, name in vpses:
|
|
vps_ok, vps_msg = check_vps(ip, name)
|
|
results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}
|
|
|
|
current_provider = cfg.get("model", {}).get("provider", "kimi-coding")
|
|
|
|
# ─── FALLBACK LOGIC ───
|
|
|
|
# Case 1: Primary (Kimi) down, local available
|
|
if not kimi_ok and current_provider == "kimi-coding":
|
|
if llama_ok:
|
|
msg = fallback_to_local_model(cfg)
|
|
results["actions"].append(msg)
|
|
state["active_fallbacks"].append("kimi->local-llama")
|
|
results["status"] = "degraded_local"
|
|
elif ollama_ok:
|
|
msg = fallback_to_ollama(cfg)
|
|
results["actions"].append(msg)
|
|
state["active_fallbacks"].append("kimi->ollama")
|
|
results["status"] = "degraded_ollama"
|
|
else:
|
|
msg = enter_safe_mode(state)
|
|
results["actions"].append(msg)
|
|
results["status"] = "safe_mode"
|
|
|
|
# Case 2: Already on fallback, check if primary recovered
|
|
elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
|
|
msg = restore_config()
|
|
results["actions"].append(msg)
|
|
state["active_fallbacks"].remove("kimi->local-llama")
|
|
results["status"] = "recovered"
|
|
elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
|
|
msg = restore_config()
|
|
results["actions"].append(msg)
|
|
state["active_fallbacks"].remove("kimi->ollama")
|
|
results["status"] = "recovered"
|
|
|
|
# Case 3: Gitea down — just flag it, work locally
|
|
if not gitea_ok:
|
|
results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
|
|
if "gitea_down" not in state.get("active_fallbacks", []):
|
|
state["active_fallbacks"].append("gitea_down")
|
|
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0)
|
|
elif "gitea_down" in state.get("active_fallbacks", []):
|
|
state["active_fallbacks"].remove("gitea_down")
|
|
results["actions"].append("Gitea recovered — resume normal operations")
|
|
|
|
# Case 4: VPS agents down
|
|
for ip, name in vpses:
|
|
key = f"vps_{name.lower()}"
|
|
if not results["checks"][key]["ok"]:
|
|
results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")
|
|
|
|
save_state(state)
|
|
return results
|
|
|
|
if __name__ == "__main__":
|
|
results = diagnose_and_fallback()
|
|
print(json.dumps(results, indent=2))
|
|
|
|
# Exit codes for cron integration
|
|
if results["status"] == "safe_mode":
|
|
sys.exit(2)
|
|
elif results["status"].startswith("degraded"):
|
|
sys.exit(1)
|
|
else:
|
|
sys.exit(0)
|