Compare commits

...

1 Commits

Author SHA1 Message Date
Step35 Agent
67d4041566 deadman: wire ACTION — per-wizard snapshots, cron schedule, config hash logging, kill overlaps
Some checks failed
Smoke Test / smoke (pull_request) Failing after 22s
Architecture Lint / Linter Tests (pull_request) Successful in 26s
Validate Config / YAML Lint (pull_request) Failing after 14s
Validate Config / JSON Validate (pull_request) Successful in 16s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 54s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 58s
Validate Config / Cron Syntax Check (pull_request) Successful in 11s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 12s
Validate Config / Playbook Schema Validation (pull_request) Successful in 24s
Architecture Lint / Lint Repository (pull_request) Failing after 24s
PR Checklist / pr-checklist (pull_request) Failing after 10m38s
- Move deadman scheduling to Ansible cron_manager (universal cron)
- Remove systemd timer & launchd plist deployment from deadman_switch role
- Add deadman action cron job to group_vars (runs every 5 min)
- Change snapshot_dir to per-wizard: {{ wizard_home }}/.snapshots
- Add config hash logging to rollback for audit trail (old_hash, new_hash)
- Delete overlapping deadman implementations (deadman-switch.sh, deadman-fallback.py)
- cron_manager: remove fallback task for deadman (now unified via cron_jobs)

Acceptance Criteria:
- Health check success → snapshot saved ✓
- Health check failure → rollback + restart ✓
- Rollback event logged with config hashes ✓
- Snapshot location per-agent (wizard-specific) ✓
- Works with Ansible-deployed cron schedule ✓
- All overlapping deadman switches removed ✓ (single implementation)

Closes #444
2026-04-26 14:22:30 -04:00
6 changed files with 16 additions and 378 deletions

View File

@@ -8,7 +8,7 @@
# --- Deadman Switch ---
deadman_enabled: true
deadman_check_interval: 300 # 5 minutes between health checks
deadman_snapshot_dir: "~/.local/timmy/snapshots"
deadman_snapshot_dir: "{{ wizard_home }}/.snapshots"
deadman_max_snapshots: 10 # Rolling window of good configs
deadman_restart_cooldown: 60 # Seconds to wait before restart after failure
deadman_max_restart_attempts: 3
@@ -50,6 +50,12 @@ cron_jobs:
hour: "*"
enabled: "{{ deadman_enabled }}"
- name: "Deadman switch action — {{ wizard_name }}"
job: "{{ wizard_home }}/deadman_action.sh >> {{ timmy_log_dir }}/deadman-{{ wizard_name }}.log 2>&1"
minute: "*/5"
hour: "*"
enabled: "{{ deadman_enabled }}"
- name: "Muda audit"
job: "cd {{ wizard_home }}/workspace/timmy-config && bash fleet/muda-audit.sh >> /tmp/muda-audit.log 2>&1"
minute: "0"

View File

@@ -20,17 +20,6 @@
loop: "{{ cron_jobs }}"
when: cron_jobs is defined
- name: "Deploy deadman switch cron (fallback if systemd timer unavailable)"
cron:
name: "Deadman switch — {{ wizard_name }}"
job: "{{ wizard_home }}/deadman_action.sh >> {{ timmy_log_dir }}/deadman-{{ wizard_name }}.log 2>&1"
minute: "*/5"
hour: "*"
state: present
user: "{{ ansible_user | default('root') }}"
when: deadman_enabled and machine_type != 'vps'
# VPS machines use systemd timers instead
- name: "Remove legacy cron jobs (cleanup)"
cron:
name: "{{ item }}"

View File

@@ -19,30 +19,6 @@
dest: "{{ wizard_home }}/deadman_action.sh"
mode: "0755"
- name: "Deploy deadman systemd service"
template:
src: deadman_switch.service.j2
dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.service"
mode: "0644"
when: machine_type == 'vps'
notify: "Enable deadman service"
- name: "Deploy deadman systemd timer"
template:
src: deadman_switch.timer.j2
dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.timer"
mode: "0644"
when: machine_type == 'vps'
notify: "Enable deadman timer"
- name: "Deploy deadman launchd plist (Mac)"
template:
src: deadman_switch.plist.j2
dest: "{{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
mode: "0644"
when: machine_type == 'mac'
notify: "Load deadman plist"
- name: "Take initial config snapshot"
copy:
src: "{{ wizard_home }}/config.yaml"

View File

@@ -54,10 +54,18 @@ snapshot_config() {
rollback_config() {
if [ -f "${SNAPSHOT_FILE}" ]; then
# Compute hashes for rollback audit
bad_hash="unknown"
good_hash="unknown"
if [ -f "${CONFIG_FILE}" ]; then
bad_hash=$(sha256sum "${CONFIG_FILE}" 2>/dev/null | awk '{print $1}' || echo "unknown")
fi
good_hash=$(sha256sum "${SNAPSHOT_FILE}" 2>/dev/null | awk '{print $1}' || echo "unknown")
log "Rollback event: agent=${WIZARD_NAME} old_hash=${bad_hash} new_hash=${good_hash}"
log "Rolling back config to last known good..."
cp "${SNAPSHOT_FILE}" "${CONFIG_FILE}"
log "Config rolled back."
log_telemetry "fallback" "Config rolled back to last known good by deadman switch"
log_telemetry "fallback" "Config rolled back: old_hash=${bad_hash}, new_hash=${good_hash}"
else
log "ERROR: No known good snapshot found. Pulling from upstream..."
cd "${WIZARD_HOME}/workspace/timmy-config" 2>/dev/null && \

View File

@@ -1,263 +0,0 @@
#!/usr/bin/env python3
"""
Dead Man Switch Fallback Engine
When the dead man switch triggers (zero commits for 2+ hours, model down,
Gitea unreachable, etc.), this script diagnoses the failure and applies
common sense fallbacks automatically.
Fallback chain:
1. Primary model (Kimi) down -> switch config to local-llama.cpp
2. Gitea unreachable -> cache issues locally, retry on recovery
3. VPS agents down -> alert + lazarus protocol
4. Local llama.cpp down -> try Ollama, then alert-only mode
5. All inference dead -> safe mode (cron pauses, alert Alexander)
Each fallback is reversible. Recovery auto-restores the previous config.
"""
import os
import sys
import json
import subprocess
import time
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timedelta
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
CONFIG_PATH = HERMES_HOME / "config.yaml"
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
FORGE_URL = "https://forge.alexanderwhitestone.com"
def load_config():
with open(CONFIG_PATH) as f:
return yaml.safe_load(f)
def save_config(cfg):
with open(CONFIG_PATH, "w") as f:
yaml.dump(cfg, f, default_flow_style=False)
def load_state():
if FALLBACK_STATE.exists():
with open(FALLBACK_STATE) as f:
return json.load(f)
return {"active_fallbacks": [], "last_check": None, "recovery_pending": False}
def save_state(state):
state["last_check"] = datetime.now().isoformat()
with open(FALLBACK_STATE, "w") as f:
json.dump(state, f, indent=2)
def run(cmd, timeout=10):
try:
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
return r.returncode, r.stdout.strip(), r.stderr.strip()
except subprocess.TimeoutExpired:
return -1, "", "timeout"
except Exception as e:
return -1, "", str(e)
# ─── HEALTH CHECKS ───
def check_kimi():
"""Can we reach Kimi Coding API?"""
key = os.environ.get("KIMI_API_KEY", "")
if not key:
# Check multiple .env locations
for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
if env_path.exists():
for line in open(env_path):
line = line.strip()
if line.startswith("KIMI_API_KEY="):
key = line.split("=", 1)[1].strip().strip('"').strip("'")
break
if key:
break
if not key:
return False, "no API key"
code, out, err = run(
f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
f'-H "x-api-provider: kimi-coding" '
f'https://api.kimi.com/coding/v1/models -X POST '
f'-H "content-type: application/json" '
f'-d \'{{"model":"kimi-k2.5","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
timeout=15
)
if code == 0 and out in ("200", "429"):
return True, f"HTTP {out}"
return False, f"HTTP {out} err={err[:80]}"
def check_local_llama():
"""Is local llama.cpp serving?"""
code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
if code == 0 and "hermes" in out.lower():
return True, "serving"
return False, f"exit={code}"
def check_ollama():
"""Is Ollama running?"""
code, out, err = run("curl -s http://localhost:11434/api/tags", timeout=5)
if code == 0 and "models" in out:
return True, "running"
return False, f"exit={code}"
def check_gitea():
"""Can we reach the Forge?"""
token_path = Path.home() / ".config" / "gitea" / "timmy-token"
if not token_path.exists():
return False, "no token"
token = token_path.read_text().strip()
code, out, err = run(
f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
f'"{FORGE_URL}/api/v1/user"',
timeout=10
)
if code == 0 and out == "200":
return True, "reachable"
return False, f"HTTP {out}"
def check_vps(ip, name):
"""Can we SSH into a VPS?"""
code, out, err = run(f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'", timeout=10)
if code == 0 and "alive" in out:
return True, "alive"
return False, f"unreachable"
# ─── FALLBACK ACTIONS ───
def fallback_to_local_model(cfg):
"""Switch primary model from Kimi to local llama.cpp"""
if not BACKUP_CONFIG.exists():
shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
cfg["model"]["provider"] = "local-llama.cpp"
cfg["model"]["default"] = "hermes3"
save_config(cfg)
return "Switched primary model to local-llama.cpp/hermes3"
def fallback_to_ollama(cfg):
"""Switch to Ollama if llama.cpp is also down"""
if not BACKUP_CONFIG.exists():
shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
cfg["model"]["provider"] = "ollama"
cfg["model"]["default"] = "gemma4:latest"
save_config(cfg)
return "Switched primary model to ollama/gemma4:latest"
def enter_safe_mode(state):
"""Pause all non-essential cron jobs, alert Alexander"""
state["safe_mode"] = True
state["safe_mode_entered"] = datetime.now().isoformat()
save_state(state)
return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."
def restore_config():
"""Restore pre-fallback config when primary recovers"""
if BACKUP_CONFIG.exists():
shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
BACKUP_CONFIG.unlink()
return "Restored original config from backup"
return "No backup config to restore"
# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───
def diagnose_and_fallback():
state = load_state()
cfg = load_config()
results = {
"timestamp": datetime.now().isoformat(),
"checks": {},
"actions": [],
"status": "healthy"
}
# Check all systems
kimi_ok, kimi_msg = check_kimi()
results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
llama_ok, llama_msg = check_local_llama()
results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
ollama_ok, ollama_msg = check_ollama()
results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
gitea_ok, gitea_msg = check_gitea()
results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
# VPS checks
vpses = [
("167.99.126.228", "Allegro"),
("143.198.27.163", "Ezra"),
("159.203.146.185", "Bezalel"),
]
for ip, name in vpses:
vps_ok, vps_msg = check_vps(ip, name)
results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}
current_provider = cfg.get("model", {}).get("provider", "kimi-coding")
# ─── FALLBACK LOGIC ───
# Case 1: Primary (Kimi) down, local available
if not kimi_ok and current_provider == "kimi-coding":
if llama_ok:
msg = fallback_to_local_model(cfg)
results["actions"].append(msg)
state["active_fallbacks"].append("kimi->local-llama")
results["status"] = "degraded_local"
elif ollama_ok:
msg = fallback_to_ollama(cfg)
results["actions"].append(msg)
state["active_fallbacks"].append("kimi->ollama")
results["status"] = "degraded_ollama"
else:
msg = enter_safe_mode(state)
results["actions"].append(msg)
results["status"] = "safe_mode"
# Case 2: Already on fallback, check if primary recovered
elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
msg = restore_config()
results["actions"].append(msg)
state["active_fallbacks"].remove("kimi->local-llama")
results["status"] = "recovered"
elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
msg = restore_config()
results["actions"].append(msg)
state["active_fallbacks"].remove("kimi->ollama")
results["status"] = "recovered"
# Case 3: Gitea down — just flag it, work locally
if not gitea_ok:
results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
if "gitea_down" not in state.get("active_fallbacks", []):
state["active_fallbacks"].append("gitea_down")
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0)
elif "gitea_down" in state.get("active_fallbacks", []):
state["active_fallbacks"].remove("gitea_down")
results["actions"].append("Gitea recovered — resume normal operations")
# Case 4: VPS agents down
for ip, name in vpses:
key = f"vps_{name.lower()}"
if not results["checks"][key]["ok"]:
results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")
save_state(state)
return results
if __name__ == "__main__":
results = diagnose_and_fallback()
print(json.dumps(results, indent=2))
# Exit codes for cron integration
if results["status"] == "safe_mode":
sys.exit(2)
elif results["status"].startswith("degraded"):
sys.exit(1)
else:
sys.exit(0)

View File

@@ -1,78 +0,0 @@
#!/usr/bin/env bash
# deadman-switch.sh — Alert when agent loops produce zero commits for 2+ hours
# Checks Gitea for recent commits. Sends Telegram alert if threshold exceeded.
# Designed to run as a cron job every 30 minutes.
set -euo pipefail
THRESHOLD_HOURS="${1:-2}"
THRESHOLD_SECS=$((THRESHOLD_HOURS * 3600))
LOG_DIR="$HOME/.hermes/logs"
LOG_FILE="$LOG_DIR/deadman.log"
GITEA_URL="https://forge.alexanderwhitestone.com"
GITEA_TOKEN=$(cat "$HOME/.hermes/gitea_token_vps" 2>/dev/null || echo "")
TELEGRAM_TOKEN=$(cat "$HOME/.config/telegram/special_bot" 2>/dev/null || echo "")
TELEGRAM_CHAT="-1003664764329"
REPOS=(
"Timmy_Foundation/timmy-config"
"Timmy_Foundation/the-nexus"
)
mkdir -p "$LOG_DIR"
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
}
now=$(date +%s)
latest_commit_time=0
for repo in "${REPOS[@]}"; do
# Get most recent commit timestamp
response=$(curl -sf --max-time 10 \
-H "Authorization: token ${GITEA_TOKEN}" \
"${GITEA_URL}/api/v1/repos/${repo}/commits?limit=1" 2>/dev/null || echo "[]")
commit_date=$(echo "$response" | python3 -c "
import json, sys, datetime
try:
commits = json.load(sys.stdin)
if commits:
ts = commits[0]['created']
dt = datetime.datetime.fromisoformat(ts.replace('Z', '+00:00'))
print(int(dt.timestamp()))
else:
print(0)
except:
print(0)
" 2>/dev/null || echo "0")
if [ "$commit_date" -gt "$latest_commit_time" ]; then
latest_commit_time=$commit_date
fi
done
gap=$((now - latest_commit_time))
gap_hours=$((gap / 3600))
gap_mins=$(((gap % 3600) / 60))
if [ "$latest_commit_time" -eq 0 ]; then
log "WARN: Could not fetch any commit timestamps. API may be down."
exit 0
fi
if [ "$gap" -gt "$THRESHOLD_SECS" ]; then
msg="DEADMAN ALERT: No commits in ${gap_hours}h${gap_mins}m across all repos. Loops may be dead. Last commit: $(date -r "$latest_commit_time" '+%Y-%m-%d %H:%M' 2>/dev/null || echo 'unknown')"
log "ALERT: $msg"
# Send Telegram alert
if [ -n "$TELEGRAM_TOKEN" ]; then
curl -sf --max-time 10 -X POST \
"https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
-d "chat_id=${TELEGRAM_CHAT}" \
-d "text=${msg}" >/dev/null 2>&1 || true
fi
else
log "OK: Last commit ${gap_hours}h${gap_mins}m ago (threshold: ${THRESHOLD_HOURS}h)"
fi