#!/usr/bin/env bash
# model-health-check.sh — Validate all configured model tags before loop startup
#
# Reads config.yaml, extracts the primary model tag and its provider, and sends
# a 1-token request per model to the provider API to see whether the tag exists.
#
# Exit status:
#   0  no model was reported dead (skips and warnings do not block startup)
#   1  at least one model was reported "not found" by the provider
#
# Environment:
#   HERMES_HOME        config root (default: $HOME/.hermes)
#   ANTHROPIC_API_KEY  API key; if unset, the first ANTHROPIC_API_KEY= line in
#                      $HERMES_HOME/.env is used instead

set -euo pipefail

readonly HERMES_DIR="${HERMES_HOME:-$HOME/.hermes}"
readonly CONFIG="$HERMES_DIR/config.yaml"
# NOTE(review): the log directory ignores HERMES_HOME and always lives under
# $HOME/.hermes. Left unchanged so existing log consumers keep working — confirm
# whether it should follow HERMES_HOME.
readonly LOG_DIR="$HOME/.hermes/logs"
readonly LOG_FILE="$LOG_DIR/model-health.log"
# Model probed for cron jobs, independent of config.yaml.
readonly CRON_MODEL="claude-haiku-4-5-20251001"

PASS=0
FAIL=0
WARN=0

#######################################
# Print a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: message words
# Outputs:   the formatted line on stdout and in LOG_FILE
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}

#######################################
# Resolve the Anthropic API key from the environment or the .env file.
# Globals:   ANTHROPIC_API_KEY (read), HERMES_DIR (read)
# Outputs:   the key on stdout (empty if none was found)
# Returns:   0 always
#######################################
anthropic_api_key() {
  local key="${ANTHROPIC_API_KEY:-}"
  local env_file="$HERMES_DIR/.env"
  local line

  if [[ -z "$key" && -r "$env_file" ]]; then
    # Pure-bash scan: first matching line wins, surrounding quotes are dropped.
    while IFS= read -r line || [[ -n "$line" ]]; do
      if [[ "$line" == ANTHROPIC_API_KEY=* ]]; then
        key=${line#ANTHROPIC_API_KEY=}
        key=${key//\"/}
        key=${key//\'/}
        break
      fi
    done < "$env_file"
  fi

  printf '%s' "$key"
}

#######################################
# Probe one Anthropic model tag with a minimal messages request.
# Globals:   none directly (uses log and anthropic_api_key)
# Arguments: $1 - model tag to test
#            $2 - label used in log lines (e.g. PRIMARY, CRON)
# Outputs:   one log line describing the outcome
# Returns:   0 model exists, or the check was skipped for lack of an API key
#            1 the API reported the model as not found
#            2 the model could not be verified (network, auth, empty reply)
#######################################
check_anthropic_model() {
  local model="$1"
  local label="$2"
  local api_key json_model payload response

  api_key=$(anthropic_api_key)
  if [[ -z "$api_key" ]]; then
    log "SKIP [$label] $model -- no ANTHROPIC_API_KEY"
    return 0
  fi

  # Escape backslashes and double quotes so the tag cannot break the JSON body.
  json_model=${model//\\/\\\\}
  json_model=${json_model//\"/\\\"}
  payload="{\"model\":\"${json_model}\",\"max_tokens\":1,\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}"

  # Deliberately no --fail: with it curl discards the body of HTTP >= 400
  # replies, so the "not_found_error" payload that marks a dead model would
  # never be seen. curl's own exit status still signals transport failures.
  if ! response=$(curl -s --max-time 10 -X POST \
    "https://api.anthropic.com/v1/messages" \
    -H "x-api-key: ${api_key}" \
    -H "anthropic-version: 2023-06-01" \
    -H "content-type: application/json" \
    -d "$payload"); then
    log "WARN [$label] $model -- could not reach API"
    return 2
  fi

  case "$response" in
    '')
      log "WARN [$label] $model -- empty response from API"
      return 2
      ;;
    *'"not_found_error"'*)
      log "FAIL [$label] $model -- model not found (404)"
      return 1
      ;;
    *'"rate_limit_error"'* | *'"overloaded_error"'*)
      log "PASS [$label] $model -- rate limited but model exists"
      return 0
      ;;
    *'"content"'*)
      log "PASS [$label] $model -- healthy"
      return 0
      ;;
    *'"authentication_error"'* | *'"permission_error"'*)
      # The request was rejected on credentials, so this reply says nothing
      # about whether the model tag exists.
      log "WARN [$label] $model -- API rejected credentials, model not verified"
      return 2
      ;;
    *)
      log "PASS [$label] $model -- responded (non-404)"
      return 0
      ;;
  esac
}

#######################################
# Read one key of the top-level `model` entry from config.yaml.
# A scalar `model:` value is treated as the `default` key.
# Globals:   CONFIG (read)
# Arguments: $1 - key to read (`default` or `provider`)
# Outputs:   the value on stdout (empty if absent or null)
# Returns:   non-zero if the config could not be read or parsed
#######################################
read_model_setting() {
  # The path and key travel as argv, never spliced into the Python source.
  python3 - "$CONFIG" "$1" 2>/dev/null <<'PY'
import sys
import yaml

with open(sys.argv[1]) as f:
    c = yaml.safe_load(f) or {}
m = c.get('model', {})
if isinstance(m, dict):
    print(m.get(sys.argv[2], '') or '')
elif sys.argv[2] == 'default':
    print(m or '')
else:
    print('')
PY
}

#######################################
# Fold a check_anthropic_model return code into the counters.
# Globals:   PASS, FAIL, WARN (written)
# Arguments: $1 - return code (0 pass, 1 fail, anything else warn)
#######################################
record_result() {
  case "$1" in
    0) PASS=$((PASS + 1)) ;;
    1) FAIL=$((FAIL + 1)) ;;
    *) WARN=$((WARN + 1)) ;;
  esac
}

#######################################
# Run every check and report the totals.
# Globals:   CONFIG, CRON_MODEL, LOG_DIR (read); PASS, FAIL, WARN (written)
# Returns:   1 if any model is dead, 0 otherwise
#######################################
main() {
  local primary="" provider="" rc

  mkdir -p "$LOG_DIR"
  log "=== Model Health Check ==="

  # Primary model
  if ! primary=$(read_model_setting default); then
    # Surface the failure instead of silently skipping the primary check.
    log "WARN [PRIMARY] could not read model settings from $CONFIG"
    WARN=$((WARN + 1))
    primary=""
  else
    provider=$(read_model_setting provider) || provider=""
  fi

  if [[ -n "$primary" && "$provider" == "anthropic" ]]; then
    rc=0
    check_anthropic_model "$primary" "PRIMARY" || rc=$?
    record_result "$rc"
    if ((rc == 1)); then
      log "CRITICAL: Primary model $primary is DEAD. Loops will fail."
      log "Known good alternatives: claude-opus-4.6, claude-haiku-4-5-20251001"
    fi
  elif [[ -n "$primary" ]]; then
    log "SKIP [PRIMARY] $primary -- non-anthropic provider ($provider), no validator yet"
  fi

  # Cron model check (haiku)
  rc=0
  check_anthropic_model "$CRON_MODEL" "CRON" || rc=$?
  record_result "$rc"

  log "=== Results: PASS=$PASS FAIL=$FAIL WARN=$WARN ==="

  if ((FAIL > 0)); then
    log "BLOCKING: $FAIL model(s) are dead. Fix config before starting loops."
    return 1
  fi
  return 0
}

# The status of main becomes the script's exit status.
main