deadman: wire ACTION — per-wizard snapshots, cron schedule, config hash logging, kill overlaps

- Move deadman scheduling to Ansible cron_manager (universal cron)
- Remove systemd service/timer & launchd plist deployment from deadman_switch role
- Add deadman action cron job to group_vars (runs every 5 min)
- Change deadman_snapshot_dir to per-wizard: {{ wizard_home }}/.snapshots
- Add config hash logging to rollback for audit trail (old_hash, new_hash)
- Delete overlapping deadman implementations (deadman-switch.sh, deadman-fallback.py)
- cron_manager: remove fallback task for deadman (now unified via cron_jobs)

Acceptance Criteria:
- Health check success → snapshot saved ✓
- Health check failure → rollback + restart ✓
- Rollback event logged with config hashes ✓
- Snapshot location per-agent (wizard-specific) ✓
- Works with Ansible-deployed cron schedule ✓
- All overlapping deadman switches removed ✓ (single implementation)

Closes #444
2026-04-26 14:22:30 -04:00
6 changed files with 16 additions and 378 deletions
--- a/ansible/inventory/group_vars/wizards.yml
+++ b/ansible/inventory/group_vars/wizards.yml
@@ -8,7 +8,7 @@
 # --- Deadman Switch ---
 deadman_enabled: true
 deadman_check_interval: 300    # 5 minutes between health checks
-deadman_snapshot_dir: "~/.local/timmy/snapshots"
+deadman_snapshot_dir: "{{ wizard_home }}/.snapshots"
 deadman_max_snapshots: 10      # Rolling window of good configs
 deadman_restart_cooldown: 60   # Seconds to wait before restart after failure
 deadman_max_restart_attempts: 3
@@ -50,6 +50,12 @@ cron_jobs:
    hour: "*"
    enabled: "{{ deadman_enabled }}"

+  - name: "Deadman switch action — {{ wizard_name }}"
+    job: "{{ wizard_home }}/deadman_action.sh >> {{ timmy_log_dir }}/deadman-{{ wizard_name }}.log 2>&1"
+    minute: "*/5"
+    hour: "*"
+    enabled: "{{ deadman_enabled }}"
+
  - name: "Muda audit"
    job: "cd {{ wizard_home }}/workspace/timmy-config && bash fleet/muda-audit.sh >> /tmp/muda-audit.log 2>&1"
    minute: "0"
--- a/ansible/roles/cron_manager/tasks/main.yml
+++ b/ansible/roles/cron_manager/tasks/main.yml
@@ -20,17 +20,6 @@
  loop: "{{ cron_jobs }}"
  when: cron_jobs is defined

- name: "Deploy deadman switch cron (fallback if systemd timer unavailable)"
-  cron:
-    name: "Deadman switch — {{ wizard_name }}"
-    job: "{{ wizard_home }}/deadman_action.sh >> {{ timmy_log_dir }}/deadman-{{ wizard_name }}.log 2>&1"
-    minute: "*/5"
-    hour: "*"
-    state: present
-    user: "{{ ansible_user | default('root') }}"
-  when: deadman_enabled and machine_type != 'vps'
-  # VPS machines use systemd timers instead
-
 - name: "Remove legacy cron jobs (cleanup)"
  cron:
    name: "{{ item }}"
--- a/ansible/roles/deadman_switch/tasks/main.yml
+++ b/ansible/roles/deadman_switch/tasks/main.yml
@@ -19,30 +19,6 @@
    dest: "{{ wizard_home }}/deadman_action.sh"
    mode: "0755"

- name: "Deploy deadman systemd service"
-  template:
-    src: deadman_switch.service.j2
-    dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.service"
-    mode: "0644"
-  when: machine_type == 'vps'
-  notify: "Enable deadman service"
-
- name: "Deploy deadman systemd timer"
-  template:
-    src: deadman_switch.timer.j2
-    dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.timer"
-    mode: "0644"
-  when: machine_type == 'vps'
-  notify: "Enable deadman timer"
-
- name: "Deploy deadman launchd plist (Mac)"
-  template:
-    src: deadman_switch.plist.j2
-    dest: "{{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
-    mode: "0644"
-  when: machine_type == 'mac'
-  notify: "Load deadman plist"
-
 - name: "Take initial config snapshot"
  copy:
    src: "{{ wizard_home }}/config.yaml"
--- a/ansible/roles/deadman_switch/templates/deadman_action.sh.j2
+++ b/ansible/roles/deadman_switch/templates/deadman_action.sh.j2
@@ -54,10 +54,18 @@ snapshot_config() {

 rollback_config() {
    if [ -f "${SNAPSHOT_FILE}" ]; then
+        # Compute hashes for rollback audit
+        bad_hash="unknown"
+        good_hash="unknown"
+        if [ -f "${CONFIG_FILE}" ]; then
+            bad_hash=$(sha256sum "${CONFIG_FILE}" 2>/dev/null | awk '{print $1}' || echo "unknown")
+        fi
+        good_hash=$(sha256sum "${SNAPSHOT_FILE}" 2>/dev/null | awk '{print $1}' || echo "unknown")
+        log "Rollback event: agent=${WIZARD_NAME} old_hash=${bad_hash} new_hash=${good_hash}"
        log "Rolling back config to last known good..."
        cp "${SNAPSHOT_FILE}" "${CONFIG_FILE}"
        log "Config rolled back."
-        log_telemetry "fallback" "Config rolled back to last known good by deadman switch"
+        log_telemetry "fallback" "Config rolled back: old_hash=${bad_hash}, new_hash=${good_hash}"
    else
        log "ERROR: No known good snapshot found. Pulling from upstream..."
        cd "${WIZARD_HOME}/workspace/timmy-config" 2>/dev/null && \
--- a/bin/deadman-fallback.py
+++ b/bin/deadman-fallback.py
@@ -1,263 +0,0 @@
-#!/usr/bin/env python3
-"""
-Dead Man Switch Fallback Engine
-
-When the dead man switch triggers (zero commits for 2+ hours, model down,
-Gitea unreachable, etc.), this script diagnoses the failure and applies
-common sense fallbacks automatically.
-
-Fallback chain:
-1. Primary model (Kimi) down -> switch config to local-llama.cpp
-2. Gitea unreachable -> cache issues locally, retry on recovery
-3. VPS agents down -> alert + lazarus protocol
-4. Local llama.cpp down -> try Ollama, then alert-only mode
-5. All inference dead -> safe mode (cron pauses, alert Alexander)
-
-Each fallback is reversible. Recovery auto-restores the previous config.
-"""
-import os
-import sys
-import json
-import subprocess
-import time
-import yaml
-import shutil
-from pathlib import Path
-from datetime import datetime, timedelta
-
-HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
-CONFIG_PATH = HERMES_HOME / "config.yaml"
-FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
-BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
-FORGE_URL = "https://forge.alexanderwhitestone.com"
-
-def load_config():
-    with open(CONFIG_PATH) as f:
-        return yaml.safe_load(f)
-
-def save_config(cfg):
-    with open(CONFIG_PATH, "w") as f:
-        yaml.dump(cfg, f, default_flow_style=False)
-
-def load_state():
-    if FALLBACK_STATE.exists():
-        with open(FALLBACK_STATE) as f:
-            return json.load(f)
-    return {"active_fallbacks": [], "last_check": None, "recovery_pending": False}
-
-def save_state(state):
-    state["last_check"] = datetime.now().isoformat()
-    with open(FALLBACK_STATE, "w") as f:
-        json.dump(state, f, indent=2)
-
-def run(cmd, timeout=10):
-    try:
-        r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
-        return r.returncode, r.stdout.strip(), r.stderr.strip()
-    except subprocess.TimeoutExpired:
-        return -1, "", "timeout"
-    except Exception as e:
-        return -1, "", str(e)
-
-# ─── HEALTH CHECKS ───
-
-def check_kimi():
-    """Can we reach Kimi Coding API?"""
-    key = os.environ.get("KIMI_API_KEY", "")
-    if not key:
-        # Check multiple .env locations
-        for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
-            if env_path.exists():
-                for line in open(env_path):
-                    line = line.strip()
-                    if line.startswith("KIMI_API_KEY="):
-                        key = line.split("=", 1)[1].strip().strip('"').strip("'")
-                        break
-            if key:
-                break
-    if not key:
-        return False, "no API key"
-    code, out, err = run(
-        f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
-        f'-H "x-api-provider: kimi-coding" '
-        f'https://api.kimi.com/coding/v1/models -X POST '
-        f'-H "content-type: application/json" '
-        f'-d \'{{"model":"kimi-k2.5","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
-        timeout=15
-    )
-    if code == 0 and out in ("200", "429"):
-        return True, f"HTTP {out}"
-    return False, f"HTTP {out} err={err[:80]}"
-
-def check_local_llama():
-    """Is local llama.cpp serving?"""
-    code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
-    if code == 0 and "hermes" in out.lower():
-        return True, "serving"
-    return False, f"exit={code}"
-
-def check_ollama():
-    """Is Ollama running?"""
-    code, out, err = run("curl -s http://localhost:11434/api/tags", timeout=5)
-    if code == 0 and "models" in out:
-        return True, "running"
-    return False, f"exit={code}"
-
-def check_gitea():
-    """Can we reach the Forge?"""
-    token_path = Path.home() / ".config" / "gitea" / "timmy-token"
-    if not token_path.exists():
-        return False, "no token"
-    token = token_path.read_text().strip()
-    code, out, err = run(
-        f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
-        f'"{FORGE_URL}/api/v1/user"',
-        timeout=10
-    )
-    if code == 0 and out == "200":
-        return True, "reachable"
-    return False, f"HTTP {out}"
-
-def check_vps(ip, name):
-    """Can we SSH into a VPS?"""
-    code, out, err = run(f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'", timeout=10)
-    if code == 0 and "alive" in out:
-        return True, "alive"
-    return False, f"unreachable"
-
-# ─── FALLBACK ACTIONS ───
-
-def fallback_to_local_model(cfg):
-    """Switch primary model from Kimi to local llama.cpp"""
-    if not BACKUP_CONFIG.exists():
-        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
-    
-    cfg["model"]["provider"] = "local-llama.cpp"
-    cfg["model"]["default"] = "hermes3"
-    save_config(cfg)
-    return "Switched primary model to local-llama.cpp/hermes3"
-
-def fallback_to_ollama(cfg):
-    """Switch to Ollama if llama.cpp is also down"""
-    if not BACKUP_CONFIG.exists():
-        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
-    
-    cfg["model"]["provider"] = "ollama"
-    cfg["model"]["default"] = "gemma4:latest"
-    save_config(cfg)
-    return "Switched primary model to ollama/gemma4:latest"
-
-def enter_safe_mode(state):
-    """Pause all non-essential cron jobs, alert Alexander"""
-    state["safe_mode"] = True
-    state["safe_mode_entered"] = datetime.now().isoformat()
-    save_state(state)
-    return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."
-
-def restore_config():
-    """Restore pre-fallback config when primary recovers"""
-    if BACKUP_CONFIG.exists():
-        shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
-        BACKUP_CONFIG.unlink()
-        return "Restored original config from backup"
-    return "No backup config to restore"
-
-# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───
-
-def diagnose_and_fallback():
-    state = load_state()
-    cfg = load_config()
-    
-    results = {
-        "timestamp": datetime.now().isoformat(),
-        "checks": {},
-        "actions": [],
-        "status": "healthy"
-    }
-    
-    # Check all systems
-    kimi_ok, kimi_msg = check_kimi()
-    results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
-    
-    llama_ok, llama_msg = check_local_llama()
-    results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
-    
-    ollama_ok, ollama_msg = check_ollama()
-    results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
-    
-    gitea_ok, gitea_msg = check_gitea()
-    results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
-    
-    # VPS checks
-    vpses = [
-        ("167.99.126.228", "Allegro"),
-        ("143.198.27.163", "Ezra"),
-        ("159.203.146.185", "Bezalel"),
-    ]
-    for ip, name in vpses:
-        vps_ok, vps_msg = check_vps(ip, name)
-        results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}
-    
-    current_provider = cfg.get("model", {}).get("provider", "kimi-coding")
-    
-    # ─── FALLBACK LOGIC ───
-    
-    # Case 1: Primary (Kimi) down, local available
-    if not kimi_ok and current_provider == "kimi-coding":
-        if llama_ok:
-            msg = fallback_to_local_model(cfg)
-            results["actions"].append(msg)
-            state["active_fallbacks"].append("kimi->local-llama")
-            results["status"] = "degraded_local"
-        elif ollama_ok:
-            msg = fallback_to_ollama(cfg)
-            results["actions"].append(msg)
-            state["active_fallbacks"].append("kimi->ollama")
-            results["status"] = "degraded_ollama"
-        else:
-            msg = enter_safe_mode(state)
-            results["actions"].append(msg)
-            results["status"] = "safe_mode"
-    
-    # Case 2: Already on fallback, check if primary recovered
-    elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
-        msg = restore_config()
-        results["actions"].append(msg)
-        state["active_fallbacks"].remove("kimi->local-llama")
-        results["status"] = "recovered"
-    elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
-        msg = restore_config()
-        results["actions"].append(msg)
-        state["active_fallbacks"].remove("kimi->ollama")
-        results["status"] = "recovered"
-    
-    # Case 3: Gitea down — just flag it, work locally
-    if not gitea_ok:
-        results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
-        if "gitea_down" not in state.get("active_fallbacks", []):
-            state["active_fallbacks"].append("gitea_down")
-        results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0)
-    elif "gitea_down" in state.get("active_fallbacks", []):
-        state["active_fallbacks"].remove("gitea_down")
-        results["actions"].append("Gitea recovered — resume normal operations")
-    
-    # Case 4: VPS agents down
-    for ip, name in vpses:
-        key = f"vps_{name.lower()}"
-        if not results["checks"][key]["ok"]:
-            results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")
-    
-    save_state(state)
-    return results
-
-if __name__ == "__main__":
-    results = diagnose_and_fallback()
-    print(json.dumps(results, indent=2))
-    
-    # Exit codes for cron integration
-    if results["status"] == "safe_mode":
-        sys.exit(2)
-    elif results["status"].startswith("degraded"):
-        sys.exit(1)
-    else:
-        sys.exit(0)
--- a/bin/deadman-switch.sh
+++ b/bin/deadman-switch.sh
@@ -1,78 +0,0 @@
-#!/usr/bin/env bash
-# deadman-switch.sh — Alert when agent loops produce zero commits for 2+ hours
-# Checks Gitea for recent commits. Sends Telegram alert if threshold exceeded.
-# Designed to run as a cron job every 30 minutes.
-
-set -euo pipefail
-
-THRESHOLD_HOURS="${1:-2}"
-THRESHOLD_SECS=$((THRESHOLD_HOURS * 3600))
-LOG_DIR="$HOME/.hermes/logs"
-LOG_FILE="$LOG_DIR/deadman.log"
-GITEA_URL="https://forge.alexanderwhitestone.com"
-GITEA_TOKEN=$(cat "$HOME/.hermes/gitea_token_vps" 2>/dev/null || echo "")
-TELEGRAM_TOKEN=$(cat "$HOME/.config/telegram/special_bot" 2>/dev/null || echo "")
-TELEGRAM_CHAT="-1003664764329"
-
-REPOS=(
-  "Timmy_Foundation/timmy-config"
-  "Timmy_Foundation/the-nexus"
-)
-
-mkdir -p "$LOG_DIR"
-
-log() {
-  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
-}
-
-now=$(date +%s)
-latest_commit_time=0
-
-for repo in "${REPOS[@]}"; do
-  # Get most recent commit timestamp
-  response=$(curl -sf --max-time 10 \
-    -H "Authorization: token ${GITEA_TOKEN}" \
-    "${GITEA_URL}/api/v1/repos/${repo}/commits?limit=1" 2>/dev/null || echo "[]")
-
-  commit_date=$(echo "$response" | python3 -c "
-import json, sys, datetime
-try:
-    commits = json.load(sys.stdin)
-    if commits:
-        ts = commits[0]['created']
-        dt = datetime.datetime.fromisoformat(ts.replace('Z', '+00:00'))
-        print(int(dt.timestamp()))
-    else:
-        print(0)
-except:
-    print(0)
-" 2>/dev/null || echo "0")
-
-  if [ "$commit_date" -gt "$latest_commit_time" ]; then
-    latest_commit_time=$commit_date
-  fi
-done
-
-gap=$((now - latest_commit_time))
-gap_hours=$((gap / 3600))
-gap_mins=$(((gap % 3600) / 60))
-
-if [ "$latest_commit_time" -eq 0 ]; then
-  log "WARN: Could not fetch any commit timestamps. API may be down."
-  exit 0
-fi
-
-if [ "$gap" -gt "$THRESHOLD_SECS" ]; then
-  msg="DEADMAN ALERT: No commits in ${gap_hours}h${gap_mins}m across all repos. Loops may be dead. Last commit: $(date -r "$latest_commit_time" '+%Y-%m-%d %H:%M' 2>/dev/null || echo 'unknown')"
-  log "ALERT: $msg"
-
-  # Send Telegram alert
-  if [ -n "$TELEGRAM_TOKEN" ]; then
-    curl -sf --max-time 10 -X POST \
-      "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
-      -d "chat_id=${TELEGRAM_CHAT}" \
-      -d "text=${msg}" >/dev/null 2>&1 || true
-  fi
-else
-  log "OK: Last commit ${gap_hours}h${gap_mins}m ago (threshold: ${THRESHOLD_HOURS}h)"
-fi