From 3cf9f0de5efacbb09b21a3adb8bbf88da16d315c Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Sat, 4 Apr 2026 12:00:05 -0400 Subject: [PATCH] feat(ops): deadman switch, model health check, issue filter Closes #115: bin/deadman-switch.sh -- alerts Telegram when zero commits for 2+ hours Closes #116: bin/model-health-check.sh -- validates model tags against provider APIs Closes #117: bin/issue-filter.json + live loop patches -- excludes DO-NOT-CLOSE, EPIC, META, RETRO, INTEL, MORNING REPORT, Rockachopa-assigned issues from agent pickup All three tested locally: - deadman-switch correctly detected 14h gap and would alert - model-health-check parses config.yaml and validates (skips gracefully without API key in env) - issue filters patched into live claude-loop.sh and gemini-loop.sh --- bin/deadman-switch.sh | 78 ++++++++++++++++++++++++ bin/issue-filter.json | 19 ++++++ bin/model-health-check.sh | 125 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 222 insertions(+) create mode 100755 bin/deadman-switch.sh create mode 100644 bin/issue-filter.json create mode 100755 bin/model-health-check.sh diff --git a/bin/deadman-switch.sh b/bin/deadman-switch.sh new file mode 100755 index 00000000..8d432a3f --- /dev/null +++ b/bin/deadman-switch.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# deadman-switch.sh — Alert when agent loops produce zero commits for 2+ hours +# Checks Gitea for recent commits. Sends Telegram alert if threshold exceeded. +# Designed to run as a cron job every 30 minutes. + +set -euo pipefail + +THRESHOLD_HOURS="${1:-2}" +THRESHOLD_SECS=$((THRESHOLD_HOURS * 3600)) +LOG_DIR="$HOME/.hermes/logs" +LOG_FILE="$LOG_DIR/deadman.log" +GITEA_URL="http://143.198.27.163:3000" +GITEA_TOKEN=$(cat "$HOME/.hermes/gitea_token_vps" 2>/dev/null || echo "") +TELEGRAM_TOKEN=$(cat "$HOME/.config/telegram/special_bot" 2>/dev/null || echo "") +TELEGRAM_CHAT="-1003664764329" + +REPOS=( + "Timmy_Foundation/timmy-config" + "Timmy_Foundation/the-nexus" +) + +mkdir -p "$LOG_DIR" + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE" +} + +now=$(date +%s) +latest_commit_time=0 + +for repo in "${REPOS[@]}"; do + # Get most recent commit timestamp + response=$(curl -sf --max-time 10 \ + -H "Authorization: token ${GITEA_TOKEN}" \ + "${GITEA_URL}/api/v1/repos/${repo}/commits?limit=1" 2>/dev/null || echo "[]") + + commit_date=$(echo "$response" | python3 -c " +import json, sys, datetime +try: + commits = json.load(sys.stdin) + if commits: + ts = commits[0]['created'] + dt = datetime.datetime.fromisoformat(ts.replace('Z', '+00:00')) + print(int(dt.timestamp())) + else: + print(0) +except: + print(0) +" 2>/dev/null || echo "0") + + if [ "$commit_date" -gt "$latest_commit_time" ]; then + latest_commit_time=$commit_date + fi +done + +gap=$((now - latest_commit_time)) +gap_hours=$((gap / 3600)) +gap_mins=$(((gap % 3600) / 60)) + +if [ "$latest_commit_time" -eq 0 ]; then + log "WARN: Could not fetch any commit timestamps. API may be down." + exit 0 +fi + +if [ "$gap" -gt "$THRESHOLD_SECS" ]; then + msg="DEADMAN ALERT: No commits in ${gap_hours}h${gap_mins}m across all repos. Loops may be dead. Last commit: $(date -r "$latest_commit_time" '+%Y-%m-%d %H:%M' 2>/dev/null || echo 'unknown')" + log "ALERT: $msg" + + # Send Telegram alert + if [ -n "$TELEGRAM_TOKEN" ]; then + curl -sf --max-time 10 -X POST \ + "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \ + -d "chat_id=${TELEGRAM_CHAT}" \ + -d "text=${msg}" >/dev/null 2>&1 || true + fi +else + log "OK: Last commit ${gap_hours}h${gap_mins}m ago (threshold: ${THRESHOLD_HOURS}h)" +fi diff --git a/bin/issue-filter.json b/bin/issue-filter.json new file mode 100644 index 00000000..74638152 --- /dev/null +++ b/bin/issue-filter.json @@ -0,0 +1,19 @@ +{ + "skip_title_patterns": [ + "[DO NOT CLOSE", + "[EPIC]", + "[META]", + "[GOVERNING]", + "[PERMANENT]", + "[MORNING REPORT]", + "[RETRO]", + "[INTEL]", + "[SHOWCASE]", + "[PHILOSOPHY]", + "Master Escalation" + ], + "skip_assignees": [ + "Rockachopa" + ], + "comment": "Shared filter config for agent loops. Loaded by claude-loop.sh and gemini-loop.sh at issue selection time." +} diff --git a/bin/model-health-check.sh b/bin/model-health-check.sh new file mode 100755 index 00000000..1b163dce --- /dev/null +++ b/bin/model-health-check.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# model-health-check.sh — Validate all configured model tags before loop startup +# Reads config.yaml, extracts model tags, tests each against its provider API. +# Exit 1 if primary model is dead. Warnings for auxiliary models. + +set -euo pipefail + +CONFIG="${HERMES_HOME:-$HOME/.hermes}/config.yaml" +LOG_DIR="$HOME/.hermes/logs" +LOG_FILE="$LOG_DIR/model-health.log" + +mkdir -p "$LOG_DIR" + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE" +} + +PASS=0 +FAIL=0 +WARN=0 + +check_anthropic_model() { + local model="$1" + local label="$2" + local api_key="${ANTHROPIC_API_KEY:-}" + + if [ -z "$api_key" ]; then + # Try loading from .env + api_key=$(grep '^ANTHROPIC_API_KEY=' "${HERMES_HOME:-$HOME/.hermes}/.env" 2>/dev/null | head -1 | cut -d= -f2- | tr -d "'\"" || echo "") + fi + + if [ -z "$api_key" ]; then + log "SKIP [$label] $model -- no ANTHROPIC_API_KEY" + return 0 + fi + + response=$(curl -sf --max-time 10 -X POST \ + "https://api.anthropic.com/v1/messages" \ + -H "x-api-key: ${api_key}" \ + -H "anthropic-version: 2023-06-01" \ + -H "content-type: application/json" \ + -d "{\"model\":\"${model}\",\"max_tokens\":1,\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}" 2>&1 || echo "ERROR") + + if echo "$response" | grep -q '"not_found_error"'; then + log "FAIL [$label] $model -- model not found (404)" + return 1 + elif echo "$response" | grep -q '"rate_limit_error"\|"overloaded_error"'; then + log "PASS [$label] $model -- rate limited but model exists" + return 0 + elif echo "$response" | grep -q '"content"'; then + log "PASS [$label] $model -- healthy" + return 0 + elif echo "$response" | grep -q 'ERROR'; then + log "WARN [$label] $model -- could not reach API" + return 2 + else + log "PASS [$label] $model -- responded (non-404)" + return 0 + fi +} + +# Extract models from config +log "=== Model Health Check ===" + +# Primary model +primary=$(python3 -c " +import yaml +with open('$CONFIG') as f: + c = yaml.safe_load(f) +m = c.get('model', {}) +if isinstance(m, dict): + print(m.get('default', '')) +else: + print(m or '') +" 2>/dev/null || echo "") + +provider=$(python3 -c " +import yaml +with open('$CONFIG') as f: + c = yaml.safe_load(f) +m = c.get('model', {}) +if isinstance(m, dict): + print(m.get('provider', '')) +else: + print('') +" 2>/dev/null || echo "") + +if [ -n "$primary" ] && [ "$provider" = "anthropic" ]; then + if check_anthropic_model "$primary" "PRIMARY"; then + PASS=$((PASS + 1)) + else + rc=$? + if [ "$rc" -eq 1 ]; then + FAIL=$((FAIL + 1)) + log "CRITICAL: Primary model $primary is DEAD. Loops will fail." + log "Known good alternatives: claude-opus-4.6, claude-haiku-4-5-20251001" + else + WARN=$((WARN + 1)) + fi + fi +elif [ -n "$primary" ]; then + log "SKIP [PRIMARY] $primary -- non-anthropic provider ($provider), no validator yet" +fi + +# Cron model check (haiku) +CRON_MODEL="claude-haiku-4-5-20251001" +if check_anthropic_model "$CRON_MODEL" "CRON"; then + PASS=$((PASS + 1)) +else + rc=$? + if [ "$rc" -eq 1 ]; then + FAIL=$((FAIL + 1)) + else + WARN=$((WARN + 1)) + fi +fi + +log "=== Results: PASS=$PASS FAIL=$FAIL WARN=$WARN ===" + +if [ "$FAIL" -gt 0 ]; then + log "BLOCKING: $FAIL model(s) are dead. Fix config before starting loops." + exit 1 +fi + +exit 0