bin/deadman-fallback.py

#!/usr/bin/env python3
"""
Dead Man Switch Fallback Engine

When the dead man switch triggers (zero commits for 2+ hours, model down,
Gitea unreachable, etc.), this script diagnoses the failure and applies
common sense fallbacks automatically.

Fallback chain:
1. Primary model (Kimi or Anthropic) down -> switch config to local-llama.cpp
2. Gitea unreachable -> cache issues locally, retry on recovery
3. VPS agents down -> alert + lazarus protocol
4. Local llama.cpp down -> try Ollama, then alert-only mode
5. All inference dead -> safe mode (cron pauses, alert Alexander)

Each fallback is reversible. Recovery auto-restores the previous config.
"""
import os
import sys
import json
import subprocess
import time
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timedelta

# Root directory for Hermes files; the HERMES_HOME environment variable
# overrides the default of ~/.hermes.
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
# Live configuration file that the fallback actions rewrite.
CONFIG_PATH = HERMES_HOME / "config.yaml"
# JSON file in which this script keeps its own state between runs.
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
# Copy of config.yaml taken before the first fallback edit; restore_config()
# copies it back and deletes it.
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
# Base URL of the Gitea instance probed by check_gitea().
FORGE_URL = "https://forge.alexanderwhitestone.com"

def load_config():
    """Load the Hermes configuration from CONFIG_PATH.

    Returns:
        The parsed YAML content. An empty (or comment-only) file yields an
        empty dict instead of ``None`` so callers can use ``.get`` safely.

    Raises:
        OSError: If the config file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    with open(CONFIG_PATH) as f:
        # safe_load returns None for an empty document; normalise to a dict.
        return yaml.safe_load(f) or {}

def save_config(cfg):
    """Overwrite CONFIG_PATH with *cfg* serialized as block-style YAML."""
    with CONFIG_PATH.open("w") as handle:
        yaml.dump(cfg, stream=handle, default_flow_style=False)

def load_state():
    """Load the persisted fallback state, tolerating a missing or damaged file.

    Returns:
        dict: Always contains the keys "active_fallbacks" (a list),
        "last_check" and "recovery_pending". Values stored in FALLBACK_STATE
        override the defaults; any extra stored keys are kept.
    """
    state = {"active_fallbacks": [], "last_check": None, "recovery_pending": False}
    if not FALLBACK_STATE.exists():
        return state
    try:
        with open(FALLBACK_STATE) as f:
            stored = json.load(f)
    except (OSError, ValueError):
        # json.JSONDecodeError is a ValueError. A truncated or unreadable
        # state file must not stop the fallback engine from running, so
        # start again from defaults.
        return state
    if isinstance(stored, dict):
        state.update(stored)
    # Callers append to this list, so make sure it really is one.
    if not isinstance(state["active_fallbacks"], list):
        state["active_fallbacks"] = []
    return state

def save_state(state):
    """Stamp *state* with the current local time and write it to FALLBACK_STATE.

    The caller's dict is modified in place: its "last_check" entry is set to
    an ISO-8601 timestamp before the whole mapping is dumped as indented JSON.
    """
    state.update(last_check=datetime.now().isoformat())
    with FALLBACK_STATE.open("w") as handle:
        json.dump(state, handle, indent=2)

def run(cmd, timeout=10):
    """Run a command, capture its output, and never raise.

    Args:
        cmd: Either a shell command line (``str``/``bytes``), which is run
            through the shell, or an argument vector (list/tuple), which is
            executed directly with no shell so its arguments need no quoting.
        timeout: Seconds to wait before giving up.

    Returns:
        tuple: ``(returncode, stdout, stderr)`` with both streams stripped.
        A timeout yields ``(-1, "", "timeout")``; any other failure to run
        the command yields ``(-1, "", str(exception))``.
    """
    # A sequence combined with shell=True would drop every argument after the
    # first on POSIX, so only strings go through the shell.
    use_shell = isinstance(cmd, (str, bytes))
    try:
        r = subprocess.run(cmd, shell=use_shell, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as e:
        # Best-effort boundary: health checks treat any launch failure as "down".
        return -1, "", str(e)
    return r.returncode, r.stdout.strip(), r.stderr.strip()

# ─── HEALTH CHECKS ───

def check_kimi():
    """Probe the Kimi Coding API with a minimal authenticated request.

    The key is taken from the KIMI_API_KEY environment variable or, failing
    that, from the first ``KIMI_API_KEY=`` line found in a ``.env`` file under
    HERMES_HOME or ``~/.hermes``.

    Returns:
        tuple: ``(ok, message)``. ``ok`` is True when curl exits 0 and the
        HTTP status is 200 or 429; ``(False, "no API key")`` when no key is
        available.
    """
    import shlex  # local import keeps this block self-contained

    key = os.environ.get("KIMI_API_KEY", "")
    if not key:
        # Check multiple .env locations
        for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
            if env_path.exists():
                # Context manager so the file handle is closed promptly.
                with open(env_path) as env_file:
                    for line in env_file:
                        line = line.strip()
                        if line.startswith("KIMI_API_KEY="):
                            key = line.split("=", 1)[1].strip().strip('"').strip("'")
                            break
            if key:
                break
    if not key:
        return False, "no API key"

    # Quote the pieces that reach the shell so a key containing shell
    # metacharacters cannot break (or inject into) the command line.
    auth_header = shlex.quote(f"x-api-key: {key}")
    payload = shlex.quote(
        '{"model":"kimi-k2.5","max_tokens":1,'
        '"messages":[{"role":"user","content":"ping"}]}'
    )
    # NOTE(review): this POSTs a chat-style payload to the /models endpoint;
    # confirm that is the intended probe.
    code, out, err = run(
        'curl -s -o /dev/null -w "%{http_code}" '
        f"-H {auth_header} "
        '-H "x-api-provider: kimi-coding" '
        "https://api.kimi.com/coding/v1/models -X POST "
        '-H "content-type: application/json" '
        f"-d {payload} ",
        timeout=15,
    )
    if code == 0 and out in ("200", "429"):
        return True, f"HTTP {out}"
    return False, f"HTTP {out} err={err[:80]}"


def check_anthropic():
    """Probe the Anthropic Messages API with a one-token request.

    The key is taken from the ANTHROPIC_API_KEY environment variable or,
    failing that, from the first ``ANTHROPIC_API_KEY=`` line found in a
    ``.env`` file under HERMES_HOME or ``~/.hermes``.

    Returns:
        tuple: ``(ok, message)``. ``ok`` is True when curl exits 0 and the
        HTTP status is 200, 202 or 429; ``(False, "no API key")`` when no key
        is available.
    """
    import shlex  # local import keeps this block self-contained

    key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not key:
        # Check multiple .env locations
        for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
            if env_path.exists():
                # Context manager so the file handle is closed promptly.
                with open(env_path) as env_file:
                    for line in env_file:
                        line = line.strip()
                        if line.startswith("ANTHROPIC_API_KEY="):
                            key = line.split("=", 1)[1].strip().strip('"').strip("'")
                            break
            if key:
                break
    if not key:
        return False, "no API key"

    # Quote the pieces that reach the shell so a key containing shell
    # metacharacters cannot break (or inject into) the command line.
    auth_header = shlex.quote(f"x-api-key: {key}")
    # NOTE(review): confirm the model id below is one the API accepts; an
    # unknown model would presumably produce a status outside the accepted
    # set and make this check report the provider as down.
    payload = shlex.quote(
        '{"model":"claude-opus-4-20260514","max_tokens":1,'
        '"messages":[{"role":"user","content":"ping"}]}'
    )
    code, out, err = run(
        'curl -s -o /dev/null -w "%{http_code}" '
        f"-H {auth_header} "
        '-H "anthropic-version: 2023-06-01" '
        "https://api.anthropic.com/v1/messages -X POST "
        '-H "content-type: application/json" '
        f"-d {payload} ",
        timeout=15,
    )
    if code == 0 and out in ("200", "202", "429"):
        return True, f"HTTP {out}"
    return False, f"HTTP {out} err={err[:80]}"

def check_local_llama():
    """Report whether the llama.cpp server on localhost:8081 lists a Hermes model.

    Returns:
        tuple: ``(True, "serving")`` when curl exits 0 and the ``/v1/models``
        response contains "hermes" (case-insensitive); otherwise
        ``(False, "exit=<curl exit code>")``.
    """
    exit_code, body, _ = run("curl -s http://localhost:8081/v1/models", timeout=5)
    serving = exit_code == 0 and "hermes" in body.lower()
    return (True, "serving") if serving else (False, f"exit={exit_code}")

def check_ollama():
    """Report whether an Ollama server answers on localhost:11434.

    Returns:
        tuple: ``(True, "running")`` when curl exits 0 and the ``/api/tags``
        response contains the text "models"; otherwise
        ``(False, "exit=<curl exit code>")``.
    """
    exit_code, body, _ = run("curl -s http://localhost:11434/api/tags", timeout=5)
    # Guard clause: anything but a clean exit with a model listing is "down".
    if exit_code != 0 or "models" not in body:
        return False, f"exit={exit_code}"
    return True, "running"

def check_gitea():
    """Check that the Forge answers an authenticated ``/api/v1/user`` request.

    The token is read from ``~/.config/gitea/timmy-token``.

    Returns:
        tuple: ``(False, "no token")`` if the token file is absent,
        ``(True, "reachable")`` when curl exits 0 with HTTP 200, otherwise
        ``(False, "HTTP <status>")``.
    """
    token_file = Path.home() / ".config" / "gitea" / "timmy-token"
    if not token_file.exists():
        return False, "no token"

    token = token_file.read_text().strip()
    probe = (
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
        f'"{FORGE_URL}/api/v1/user"'
    )
    exit_code, status, _ = run(probe, timeout=10)

    reachable = exit_code == 0 and status == "200"
    return (True, "reachable") if reachable else (False, f"HTTP {status}")

def check_vps(ip, name):
    """Check that a VPS accepts a non-interactive SSH login as root.

    Args:
        ip: Address of the host to probe.
        name: Human-readable label for the host. Not used by the probe itself;
            kept so existing callers keep working.

    Returns:
        tuple: ``(True, "alive")`` when the remote ``echo alive`` succeeds,
        otherwise ``(False, "unreachable")``.
    """
    # BatchMode=yes makes ssh fail instead of prompting for a password,
    # passphrase or host-key confirmation, so the probe can never block on
    # user input.
    code, out, err = run(
        f"ssh -o BatchMode=yes -o ConnectTimeout=5 root@{ip} 'echo alive'",
        timeout=10,
    )
    if code == 0 and "alive" in out:
        return True, "alive"
    return False, "unreachable"

# ─── FALLBACK ACTIONS ───

def fallback_to_local_model(cfg):
    """Point the primary model at local llama.cpp (hermes3) and save the config.

    A backup of the current config file is taken first, but only if no backup
    exists yet, so the backup always holds the pre-fallback configuration.

    Args:
        cfg: Parsed config mapping. It is modified in place and written to
            CONFIG_PATH.

    Returns:
        str: Human-readable description of the action taken.
    """
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    # The "model" section may be missing or not a mapping; create it rather
    # than crash at the very moment a fallback is needed.
    model_cfg = cfg.get("model")
    if not isinstance(model_cfg, dict):
        model_cfg = cfg["model"] = {}
    model_cfg["provider"] = "local-llama.cpp"
    model_cfg["default"] = "hermes3"
    save_config(cfg)
    return "Switched primary model to local-llama.cpp/hermes3"

def fallback_to_ollama(cfg):
    """Point the primary model at Ollama (gemma4:latest) and save the config.

    A backup of the current config file is taken first, but only if no backup
    exists yet, so the backup always holds the pre-fallback configuration.

    Args:
        cfg: Parsed config mapping. It is modified in place and written to
            CONFIG_PATH.

    Returns:
        str: Human-readable description of the action taken.
    """
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    # The "model" section may be missing or not a mapping; create it rather
    # than crash at the very moment a fallback is needed.
    model_cfg = cfg.get("model")
    if not isinstance(model_cfg, dict):
        model_cfg = cfg["model"] = {}
    model_cfg["provider"] = "ollama"
    model_cfg["default"] = "gemma4:latest"
    save_config(cfg)
    return "Switched primary model to ollama/gemma4:latest"

def enter_safe_mode(state):
    """Flag safe mode in *state*, persist it, and return the alert text.

    Sets "safe_mode" to True and "safe_mode_entered" to the current local
    time (ISO-8601) on the caller's dict, then saves the state. This function
    only records the flag and returns the message; it does not itself pause
    any jobs or send any alert.
    """
    state.update(safe_mode=True, safe_mode_entered=datetime.now().isoformat())
    save_state(state)
    return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."

def restore_config():
    """Copy the pre-fallback backup over the live config and delete the backup.

    Returns:
        str: A message saying whether a backup was restored or none existed.
    """
    if not BACKUP_CONFIG.exists():
        return "No backup config to restore"

    shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
    BACKUP_CONFIG.unlink()
    return "Restored original config from backup"

# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───

def _is_model_tag(tag):
    """Return True for "<provider>-><target>" entries of active_fallbacks."""
    return isinstance(tag, str) and "->" in tag


def _primary_provider(cfg, state):
    """Return the provider that was primary before any model fallback.

    While a fallback is active the config file already names the fallback
    provider, so the original provider is recovered from the stored tag.
    """
    fallback_providers = ("local-llama.cpp", "ollama")
    for tag in state["active_fallbacks"]:
        if not _is_model_tag(tag):
            continue
        origin = tag.split("->", 1)[0]
        # Skip tags whose origin is itself a fallback provider: those were
        # recorded from an already-switched config and say nothing about the
        # real primary.
        if origin not in fallback_providers:
            return origin
    return cfg["model"].get("provider", "kimi-coding")


def diagnose_and_fallback():
    """Run every health check, apply or undo fallbacks, and report the outcome.

    Model handling:
      * primary down + llama.cpp up  -> switch config to local llama.cpp
      * primary down + only Ollama up -> switch config to Ollama
      * primary down + nothing local  -> enter safe mode
      * primary up while a model fallback is recorded -> restore the backup

    Gitea and VPS results only add warnings/alerts (and, for Gitea, a
    "gitea_down" marker in the state).

    Returns:
        dict: ``{"timestamp", "checks", "actions", "status"}`` where status is
        one of "healthy", "recovered", "degraded_gitea", "degraded_local",
        "degraded_ollama" or "safe_mode".
    """
    state = load_state()
    # Later code appends to this list; tolerate state files that lack it.
    if not isinstance(state.get("active_fallbacks"), list):
        state["active_fallbacks"] = []

    cfg = load_config() or {}
    # The fallback actions index cfg["model"]; make sure it is a mapping.
    if not isinstance(cfg.get("model"), dict):
        cfg["model"] = {}

    results = {
        "timestamp": datetime.now().isoformat(),
        "checks": {},
        "actions": [],
        "status": "healthy"
    }

    # The provider to health-check is the ORIGINAL primary, not whatever the
    # config currently says: during a fallback the config names the fallback.
    primary_provider = _primary_provider(cfg, state)

    # Dispatch to the correct health check (unknown providers use the Kimi check)
    provider_check = {
        "kimi-coding": check_kimi,
        "anthropic":   check_anthropic,
    }.get(primary_provider, check_kimi)

    primary_ok, primary_msg = provider_check()
    results["checks"]["primary"] = {"ok": primary_ok, "msg": primary_msg}

    llama_ok, llama_msg = check_local_llama()
    ollama_ok, ollama_msg = check_ollama()
    gitea_ok, gitea_msg = check_gitea()
    results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
    results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
    results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}

    # VPS health
    vpses = [("167.99.126.228", "Allegro"), ("143.198.27.163", "Ezra"), ("159.203.146.185", "Bezalel")]
    for ip, name in vpses:
        ok, msg = check_vps(ip, name)
        results["checks"][f"vps_{name.lower()}"] = {"ok": ok, "msg": msg}

    # ─── FALLBACK LOGIC ───

    if not primary_ok:
        # Primary provider down → try local llama, then ollama, then safe mode
        new_tag = None
        if llama_ok:
            results["actions"].append(fallback_to_local_model(cfg))
            new_tag = f"{primary_provider}->local-llama"
            results["status"] = "degraded_local"
        elif ollama_ok:
            results["actions"].append(fallback_to_ollama(cfg))
            new_tag = f"{primary_provider}->ollama"
            results["status"] = "degraded_ollama"
        else:
            # NOTE(review): nothing in this function ever clears the
            # "safe_mode" flag again; confirm it is reset elsewhere.
            results["actions"].append(enter_safe_mode(state))
            results["status"] = "safe_mode"
        if new_tag is not None:
            # Keep exactly one model tag so repeated runs while the primary
            # stays down do not grow the list.
            state["active_fallbacks"] = [
                t for t in state["active_fallbacks"] if not _is_model_tag(t)
            ] + [new_tag]
    elif any(_is_model_tag(t) for t in state["active_fallbacks"]):
        # Primary is back and a model fallback is recorded → undo it
        results["actions"].append(restore_config())
        state["active_fallbacks"] = [
            t for t in state["active_fallbacks"] if not _is_model_tag(t)
        ]
        results["status"] = "recovered"
    # Otherwise the primary is fine and nothing was switched: stay "healthy".

    # Gitea unreachable → work locally, avoid PRs
    if not gitea_ok:
        results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
        if "gitea_down" not in state["active_fallbacks"]:
            state["active_fallbacks"].append("gitea_down")
        # demote status if not already worse than degraded_gitea
        order = ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"]
        if order.index(results["status"]) < order.index("degraded_gitea"):
            results["status"] = "degraded_gitea"
    elif "gitea_down" in state["active_fallbacks"]:
        state["active_fallbacks"].remove("gitea_down")
        results["actions"].append("Gitea recovered — resume normal operations")

    # VPS alerts (informational)
    for ip, name in vpses:
        key = f"vps_{name.lower()}"
        if not results["checks"][key]["ok"]:
            results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")

    save_state(state)
    return results

if __name__ == "__main__":
    results = diagnose_and_fallback()
    print(json.dumps(results, indent=2))

    # Exit codes for cron integration:
    #   2 = safe mode, 1 = any "degraded*" status, 0 = everything else
    status = results["status"]
    if status == "safe_mode":
        exit_code = 2
    elif status.startswith("degraded"):
        exit_code = 1
    else:
        exit_code = 0
    sys.exit(exit_code)