Some checks failed
PR Checklist / pr-checklist (pull_request) Failing after 1m27s
Implements the Ansible Infrastructure as Code story from KT 2026-04-08.

One canonical Ansible playbook defines:
- Deadman switch (snapshot good config on health, rollback+restart on death)
- Golden state config deployment (Anthropic BANNED, Kimi→Gemini→Ollama)
- Cron schedule (source-controlled, no manual crontab edits)
- Agent startup sequence (pull→validate→start→verify)
- request_log telemetry table (every inference call logged)
- Thin config pattern (immutable local pointer to upstream)
- Gitea webhook handler (deploy on merge)
- Config validator (rejects banned providers)

Fleet inventory: Timmy (Mac), Allegro (VPS), Bezalel (VPS), Ezra (VPS)
Roles: wizard_base, golden_state, deadman_switch, request_log, cron_manager
Addresses: timmy-config #442, #443, #444, #445, #446
References: KT Final 2026-04-08 P2, KT Bezalel 2026-04-08 #1-#5
154 lines
5.2 KiB
Django/Jinja
154 lines
5.2 KiB
Django/Jinja
#!/usr/bin/env bash
# =============================================================================
# Deadman Switch ACTION — {{ wizard_name }}
# =============================================================================
# Generated by Ansible on {{ ansible_date_time.iso8601 }}
# DO NOT EDIT MANUALLY.
#
# On healthy check: snapshot current config as "last known good"
# On failed check: rollback config to last known good, restart agent
# =============================================================================

# Strict mode: abort on unhandled errors, unset variables and failed pipeline
# stages.
set -euo pipefail

# --- Identity and paths (filled in by the template at render time) ----------
WIZARD_NAME="{{ wizard_name }}"
WIZARD_HOME="{{ wizard_home }}"
CONFIG_FILE="{{ wizard_home }}/config.yaml"
SERVICE_NAME="hermes-{{ wizard_name | lower }}"

# --- Snapshot storage --------------------------------------------------------
SNAPSHOT_DIR="{{ deadman_snapshot_dir }}"
SNAPSHOT_FILE="${SNAPSHOT_DIR}/config.yaml.known_good"
# Number of timestamped history copies kept next to the known-good snapshot.
MAX_SNAPSHOTS="{{ deadman_max_snapshots }}"

# --- Logging and telemetry ---------------------------------------------------
REQUEST_LOG_DB="{{ request_log_path }}"
LOG_DIR="{{ timmy_log_dir }}"
LOG_FILE="${LOG_DIR}/deadman-${WIZARD_NAME}.log"

# --- Restart throttling ------------------------------------------------------
# Minimum number of seconds between two restarts issued by this script.
RESTART_COOLDOWN="{{ deadman_restart_cooldown }}"
# NOTE(review): MAX_RESTART_ATTEMPTS is rendered but never read anywhere in
# this script — confirm whether an attempt limit is still planned.
MAX_RESTART_ATTEMPTS="{{ deadman_max_restart_attempts }}"
COOLDOWN_FILE="${LOG_DIR}/deadman_cooldown_${WIZARD_NAME}"

# Ensure directories exist
mkdir -p "${SNAPSHOT_DIR}" "${LOG_DIR}"

#######################################
# Write one message to the deadman log file (prefixed with a UTC timestamp)
# and to stdout (without the timestamp).
# Globals:   LOG_FILE, WIZARD_NAME (read)
# Arguments: $* - message words, joined with single spaces
# Outputs:   one line appended to LOG_FILE, one line on stdout
# Returns:   status of the stdout write; a failed file append is tolerated
#######################################
log() {
  # Best effort on purpose: under `set -e` a failed append (missing directory,
  # full disk) would otherwise abort the script before any recovery step runs.
  printf '[%s] [deadman] [%s] %s\n' \
    "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "${WIZARD_NAME}" "$*" >> "${LOG_FILE}" || true
  printf '[deadman] [%s] %s\n' "${WIZARD_NAME}" "$*"
}

#######################################
# Record a deadman-switch event in the request_log table, if the SQLite
# database file exists. Failures of the insert are deliberately ignored so
# telemetry can never block recovery.
# Globals:   REQUEST_LOG_DB, WIZARD_NAME (read)
# Arguments: $1 - value for the status column
#            $2 - value for the error_message column
# Returns:   0 always
#######################################
log_telemetry() {
  local status="$1"
  local message="$2"
  local agent
  local sq="'"

  [ -f "${REQUEST_LOG_DB}" ] || return 0

  # The values are embedded in SQL string literals, so every single quote has
  # to be doubled; otherwise a quote in the text breaks (or alters) the INSERT.
  agent=${WIZARD_NAME//$sq/$sq$sq}
  status=${status//$sq/$sq$sq}
  message=${message//$sq/$sq$sq}

  sqlite3 "${REQUEST_LOG_DB}" "INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message) VALUES (datetime('now'), '${agent}', 'deadman_switch', 'N/A', 'health_check', '${status}', '${message}');" 2>/dev/null || true
}

#######################################
# Save the current config as the "last known good" snapshot, add a
# timestamped copy to the rolling history, and prune the history down to
# MAX_SNAPSHOTS entries. Does nothing when CONFIG_FILE does not exist.
# Globals:   CONFIG_FILE, SNAPSHOT_FILE, SNAPSHOT_DIR, MAX_SNAPSHOTS (read)
# Returns:   0 when there is no config to snapshot; otherwise the status of
#            the final log call
#######################################
snapshot_config() {
  [ -f "${CONFIG_FILE}" ] || return 0

  cp "${CONFIG_FILE}" "${SNAPSHOT_FILE}"
  # Keep rolling history: one copy per epoch second.
  cp "${CONFIG_FILE}" "${SNAPSHOT_DIR}/config.yaml.$(date +%s)"

  # Prune old snapshots. The glob expands in sorted order and the suffix is
  # an epoch timestamp, so the oldest copies come first (assumes suffixes of
  # equal digit count, which holds for `date +%s` output). Globbing instead
  # of parsing `ls` keeps paths containing whitespace intact.
  local -a history_files=()
  local count=0 candidate
  for candidate in "${SNAPSHOT_DIR}"/config.yaml.[0-9]*; do
    if [ -f "${candidate}" ]; then
      history_files[count]="${candidate}"
      count=$((count + 1))
    fi
  done

  local excess=$((count - MAX_SNAPSHOTS)) i
  for ((i = 0; i < excess; i++)); do
    # A snapshot that cannot be removed is not a reason to abort the run.
    rm -f -- "${history_files[i]}" \
      || log "WARN: Could not prune snapshot ${history_files[i]}."
  done

  log "Config snapshot saved."
}

#######################################
# Restore CONFIG_FILE from the last known good snapshot. When no snapshot
# exists, fall back to pulling the config repository checkout under
# WIZARD_HOME and copying this wizard's config from it.
# Globals:   SNAPSHOT_FILE, CONFIG_FILE, WIZARD_HOME (read)
# Returns:   status of the last log/telemetry call; an unrecoverable config
#            is reported through the log, not through the exit status
#######################################
rollback_config() {
  if [ -f "${SNAPSHOT_FILE}" ]; then
    log "Rolling back config to last known good..."
    cp "${SNAPSHOT_FILE}" "${CONFIG_FILE}"
    log "Config rolled back."
    log_telemetry "fallback" "Config rolled back to last known good by deadman switch"
    return 0
  fi

  log "ERROR: No known good snapshot found. Pulling from upstream..."
  # The subshell keeps the `cd` from changing the working directory of the
  # rest of the script. An explicit if/else (instead of `a && b || c`) makes
  # sure the CRITICAL branch runs only when the restore itself failed.
  if (
    cd "${WIZARD_HOME}/workspace/timmy-config" 2>/dev/null \
      && git pull --ff-only origin "{{ upstream_branch }}" 2>/dev/null \
      && cp "wizards/{{ wizard_name | lower }}/config.yaml" "${CONFIG_FILE}"
  ); then
    log "Config restored from upstream."
  else
    log "CRITICAL: Cannot restore config from any source."
  fi
}

#######################################
# Restart the agent service unless a restart was already issued within the
# last RESTART_COOLDOWN seconds. The restart time is stored in COOLDOWN_FILE
# before the restart is attempted, so failed attempts are throttled too.
# Globals:   COOLDOWN_FILE, RESTART_COOLDOWN, SERVICE_NAME, WIZARD_HOME (read)
# Returns:   1 when skipped because the cooldown is still active, 0 otherwise.
#            A failed restart is reported through the log and telemetry, not
#            through the exit status.
#######################################
restart_agent() {
  local now last_restart="" elapsed
  now=$(date +%s)

  # Check cooldown
  if [ -f "${COOLDOWN_FILE}" ]; then
    read -r last_restart < "${COOLDOWN_FILE}" || true
    case "${last_restart}" in
      '' | *[!0-9]*)
        # An empty or corrupt stamp would otherwise raise an arithmetic error
        # on every run and block restarts until the file is fixed by hand.
        log "WARN: Ignoring unreadable cooldown stamp in ${COOLDOWN_FILE}."
        ;;
      *)
        # 10# forces base 10 so a stamp with leading zeros is not read as octal.
        elapsed=$((now - 10#${last_restart}))
        if [ "${elapsed}" -lt "${RESTART_COOLDOWN}" ]; then
          log "Restart cooldown active (${elapsed}s / ${RESTART_COOLDOWN}s). Skipping."
          return 1
        fi
        ;;
    esac
  fi

  log "Restarting ${SERVICE_NAME}..."
  printf '%s\n' "${now}" > "${COOLDOWN_FILE}"

  local restarted=0
{% if machine_type == 'vps' %}
  if systemctl restart "${SERVICE_NAME}" 2>/dev/null; then
    log "Agent restarted via systemd."
    restarted=1
  else
    log "ERROR: systemd restart failed."
  fi
{% else %}
  if launchctl kickstart -k "ai.hermes.{{ wizard_name | lower }}" 2>/dev/null; then
    log "Agent restarted via launchctl."
    restarted=1
  elif (cd "${WIZARD_HOME}" && hermes agent start --daemon 2>/dev/null); then
    log "Agent restarted via hermes CLI."
    restarted=1
  else
    log "ERROR: All restart methods failed."
  fi
{% endif %}

  # Telemetry has to reflect the real outcome; reporting "success" after a
  # failed restart hides exactly the incidents this script exists to surface.
  if [ "${restarted}" -eq 1 ]; then
    log_telemetry "success" "Agent restarted by deadman switch"
  else
    log_telemetry "error" "Agent restart failed in deadman switch"
  fi
}

# --- Health Check ---

#######################################
# Try to open a TCP connection to the agent's API port on 127.0.0.1.
# The probe is capped at 10 seconds when `timeout` (or `gtimeout`) is
# installed. `timeout` is not guaranteed on every host this template
# targets, so without it the probe runs uncapped instead of being reported
# as a dead port.
# Returns:   0 if the connection could be opened, non-zero otherwise
#######################################
_deadman_api_port_open() {
  local probe="echo > /dev/tcp/127.0.0.1/{{ api_port }}"
  if command -v timeout > /dev/null 2>&1; then
    timeout 10 bash -c "${probe}" 2>/dev/null
  elif command -v gtimeout > /dev/null 2>&1; then
    gtimeout 10 bash -c "${probe}" 2>/dev/null
  else
    bash -c "${probe}" 2>/dev/null
  fi
}

#######################################
# Decide whether the agent is healthy. Three checks run in order and the
# first failure wins: process running, API port reachable, config free of
# banned providers.
# Globals:   SERVICE_NAME, CONFIG_FILE (read)
# Returns:   0 when all checks pass, 1 on the first failed check
#######################################
check_health() {
  # Check 1: Is the agent process running?
  # NOTE(review): `pgrep -f "hermes"` matches any process whose command line
  # contains "hermes" — confirm it cannot match unrelated processes.
{% if machine_type == 'vps' %}
  if ! systemctl is-active --quiet "${SERVICE_NAME}" 2>/dev/null \
    && ! pgrep -f "hermes" > /dev/null 2>&1; then
    log "FAIL: Agent process not running."
    return 1
  fi
{% else %}
  if ! pgrep -f "hermes" > /dev/null 2>&1; then
    log "FAIL: Agent process not running."
    return 1
  fi
{% endif %}

  # Check 2: Is the API port responding?
  if ! _deadman_api_port_open; then
    log "FAIL: API port {{ api_port }} not responding."
    return 1
  fi

  # Check 3: Does the config contain banned providers?
  # -E with plain `|`: alternation via `\|` in a basic regex is an extension
  # and not portable across grep implementations. A missing or unreadable
  # config file counts as "no banned provider".
  if grep -qiE 'anthropic|claude-sonnet|claude-opus|claude-haiku' "${CONFIG_FILE}" 2>/dev/null; then
    log "FAIL: Config contains banned provider (Anthropic). Rolling back."
    return 1
  fi

  return 0
}

# --- Main ---

#######################################
# Run one deadman-switch cycle: on a healthy agent snapshot the config; on
# an unhealthy one roll the config back and restart the agent.
# Arguments: none are read
# Returns:   status of the final log call, unless a step aborts the script
#            under `set -e`
#######################################
main() {
  log "Health check starting..."

  if check_health; then
    log "HEALTHY — snapshotting config."
    snapshot_config
    log_telemetry "success" "Health check passed"
  else
    log "UNHEALTHY — initiating recovery."
    log_telemetry "error" "Health check failed — initiating rollback"
    rollback_config
    # restart_agent returns 1 when the cooldown is still active. That is a
    # normal outcome and must not abort the script under `set -e`.
    restart_agent || log "Restart skipped this run."
  fi

  log "Health check complete."
}

# Entry point: run one health-check cycle. Command-line arguments are
# forwarded to main, which does not read them.
main "$@"