Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 44s
Smoke Test / smoke (pull_request) Failing after 36s
Validate Config / YAML Lint (pull_request) Failing after 21s
Validate Config / JSON Validate (pull_request) Successful in 28s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 2m36s
Validate Config / Shell Script Lint (pull_request) Failing after 1m3s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 12s
PR Checklist / pr-checklist (pull_request) Failing after 6m15s
Validate Config / Playbook Schema Validation (pull_request) Successful in 28s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
- Tests all configured providers - Maintains health map in tmux-state.json - Auto-switches profiles to working providers - Supports --daemon and --status modes Closes #509
412 lines
14 KiB
Python
412 lines
14 KiB
Python
#!/usr/bin/env python3
"""
Provider Health Monitor Script
Issue #509: [Robustness] Provider-aware profile config — auto-switch on failure

Monitors provider health and automatically switches profiles to working providers.

Usage:
    python3 provider-health-monitor.py           # Run once
    python3 provider-health-monitor.py --daemon  # Run continuously
    python3 provider-health-monitor.py --status  # Show provider health
"""

import json
import os
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Configuration
# Root of the Hermes installation; the HERMES_HOME environment variable
# overrides the default of ~/.hermes.
_USER_HOME = Path.home()
HERMES_HOME = Path(os.environ.get("HERMES_HOME", _USER_HOME / ".hermes"))
# One sub-directory per profile, each holding its own config.yaml.
PROFILES_DIR = HERMES_HOME.joinpath("profiles")
# Directory that receives both the log file and the JSON state file.
LOG_DIR = _USER_HOME.joinpath(".local", "timmy", "fleet-health")
STATE_FILE = LOG_DIR.joinpath("tmux-state.json")
LOG_FILE = LOG_DIR.joinpath("provider-health.log")

# Provider test endpoints
def _bearer_headers(api_key):
    """Return an ``Authorization: Bearer <key>`` header dict."""
    return {"Authorization": "Bearer " + api_key}


def _anthropic_headers(api_key):
    """Return the ``x-api-key`` header plus the pinned ``anthropic-version``."""
    return {"x-api-key": api_key, "anthropic-version": "2023-06-01"}


def _kimi_coding_headers(api_key):
    """Return the ``x-api-key`` header plus the ``x-api-provider`` marker."""
    return {"x-api-key": api_key, "x-api-provider": "kimi-coding"}


def _no_headers(api_key):
    """Return no headers; the key argument is accepted but ignored."""
    return {}


# Maps provider name -> probe description:
#   url      endpoint requested by the health check
#   method   HTTP verb used for the request
#   headers  callable taking the API key and returning the request headers
#   timeout  value handed to urlopen() as its timeout
PROVIDER_TESTS = {
    "openrouter": dict(
        url="https://openrouter.ai/api/v1/models",
        method="GET",
        headers=_bearer_headers,
        timeout=10,
    ),
    "anthropic": dict(
        url="https://api.anthropic.com/v1/models",
        method="GET",
        headers=_anthropic_headers,
        timeout=10,
    ),
    "nous": dict(
        url="https://inference.nousresearch.com/v1/models",
        method="GET",
        headers=_bearer_headers,
        timeout=10,
    ),
    "kimi-coding": dict(
        url="https://api.kimi.com/coding/v1/models",
        method="GET",
        headers=_kimi_coding_headers,
        timeout=10,
    ),
    "ollama": dict(
        url="http://localhost:11434/api/tags",
        method="GET",
        headers=_no_headers,
        timeout=5,
    ),
}

def log(msg):
    """Append a UTC-timestamped message to LOG_FILE and echo it to stdout.

    Args:
        msg: Text to record; it is prefixed with ``[YYYY-MM-DD HH:MM:SS] ``.

    The log directory is created on demand. Console output is skipped when
    ``--quiet`` appears anywhere in ``sys.argv``.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    log_entry = "[" + timestamp + "] " + msg

    LOG_DIR.mkdir(parents=True, exist_ok=True)
    # Pin the encoding: messages embed profile names and exception text, and
    # the locale default (e.g. ASCII under a C locale) would make a
    # non-ASCII message raise UnicodeEncodeError inside the logger itself.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(log_entry + "\n")

    if "--quiet" not in sys.argv:
        print(log_entry)

def get_provider_api_key(provider, env_file=None):
    """Look up a provider's API key in the Hermes ``.env`` file, then the environment.

    The variable name is ``<PROVIDER>_API_KEY`` with the provider upper-cased.
    Because a hyphen is not valid in an exported environment variable name,
    hyphenated providers such as ``kimi-coding`` are also looked up with the
    hyphens turned into underscores (``KIMI_CODING_API_KEY``). The literal
    spelling is still tried first so existing setups keep working.

    Args:
        provider: Provider identifier, e.g. ``"openrouter"``.
        env_file: Optional path of the dotenv file to read. Defaults to
            ``HERMES_HOME / ".env"``.

    Returns:
        The key as a string (whitespace and surrounding quotes stripped when
        it comes from the file), or ``None`` when no source defines it. A
        match in the file takes precedence over the process environment.
    """
    literal_name = provider.upper() + "_API_KEY"
    # dict.fromkeys de-duplicates while keeping the literal name first.
    candidates = list(dict.fromkeys([literal_name, literal_name.replace("-", "_")]))

    if env_file is None:
        env_file = HERMES_HOME / ".env"
    env_file = Path(env_file)

    if env_file.exists():
        # errors="replace" keeps a stray non-UTF-8 byte elsewhere in the file
        # from aborting the lookup; variable names themselves are ASCII.
        with open(env_file, encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                for name in candidates:
                    if line.startswith(name + "="):
                        return line.split("=", 1)[1].strip().strip("'\"")

    for name in candidates:
        value = os.environ.get(name)
        if value is not None:
            return value
    return None

def test_provider(provider, api_key=None):
    """Probe a provider's test endpoint and report whether it responds.

    Args:
        provider: Key into PROVIDER_TESTS.
        api_key: Credential handed to the provider's header builder; ``None``
            is passed on as an empty string.

    Returns:
        A ``(healthy, message)`` tuple. ``healthy`` is True for an HTTP 200
        response and for HTTP 429 (rate limited, so the endpoint is reachable).
        Every other outcome, including an unknown provider name and network
        errors, yields False with a short description.
    """
    config = PROVIDER_TESTS.get(provider)
    if not config:
        return False, "Unknown provider: " + provider

    headers = config["headers"](api_key or "")

    try:
        req = urllib.request.Request(
            config["url"],
            headers=headers,
            method=config["method"],
        )
        # The context manager closes the response (and its socket) right away;
        # without it every probe leaks a connection until garbage collection,
        # which adds up in daemon mode.
        with urllib.request.urlopen(req, timeout=config["timeout"]) as resp:
            status = resp.status
    except urllib.error.HTTPError as e:
        if e.code == 429:
            return True, "Rate limited but accessible"
        named_failures = {401: "Unauthorized (401)", 403: "Forbidden (403)"}
        return False, named_failures.get(e.code, "HTTP " + str(e.code))
    except Exception as e:
        # Best-effort probe: any other failure (DNS, timeout, TLS, ...) means
        # "unhealthy"; the message is capped to keep log lines short.
        return False, str(e)[:100]

    if status == 200:
        return True, "Healthy"
    return False, "HTTP " + str(status)

def _providers_in_config(config_path):
    """Return the set of provider names referenced by one ``config.yaml``."""
    found = set()
    with open(config_path) as f:
        config = yaml.safe_load(f)

    # safe_load gives None for an empty file and may give a scalar or list.
    if not isinstance(config, dict):
        return found

    # Primary model provider
    model_config = config.get("model", {})
    if isinstance(model_config, dict):
        provider = model_config.get("provider", "")
        if provider and isinstance(provider, str):
            found.add(provider)

    # Auxiliary providers; "auto" is a placeholder, not a concrete provider.
    auxiliary = config.get("auxiliary", {})
    if isinstance(auxiliary, dict):
        for aux_config in auxiliary.values():
            if isinstance(aux_config, dict):
                provider = aux_config.get("provider", "")
                if provider and isinstance(provider, str) and provider != "auto":
                    found.add(provider)

    return found


def get_all_providers():
    """Collect every provider named in the global config and all profile configs.

    Reads ``HERMES_HOME/config.yaml`` and each ``PROFILES_DIR/<profile>/config.yaml``.
    A config that cannot be read or parsed is skipped and the reason is
    logged. ``openrouter``, ``nous`` and ``ollama`` are always included, even
    when no config mentions them.

    Returns:
        A sorted list of provider names, so the order is stable between runs.
    """
    providers = set()

    config_files = [HERMES_HOME / "config.yaml"]
    if PROFILES_DIR.exists():
        config_files.extend(
            profile_dir / "config.yaml"
            for profile_dir in PROFILES_DIR.iterdir()
            if profile_dir.is_dir()
        )

    for config_file in config_files:
        if not config_file.exists():
            continue
        try:
            providers |= _providers_in_config(config_file)
        except Exception as e:
            # Still best-effort (one broken config must not stop the scan),
            # but the failure is now visible instead of silently discarded.
            log("Skipping unreadable config " + str(config_file) + ": " + str(e))

    # Add common providers even if not configured
    providers.update(["openrouter", "nous", "ollama"])

    return sorted(providers)

def build_health_map():
    """Probe every known provider and return the results keyed by provider name.

    Returns:
        A dict mapping provider name to a record with the keys ``healthy``
        (bool), ``message`` (probe description), ``last_test`` (UTC ISO-8601
        timestamp taken after the probe) and ``api_key_present`` (whether a
        non-empty key was found).
    """
    provider_names = get_all_providers()
    log("Testing " + str(len(provider_names)) + " providers...")

    results = {}
    for name in provider_names:
        key = get_provider_api_key(name)
        is_healthy, detail = test_provider(name, key)

        results[name] = {
            "healthy": is_healthy,
            "message": detail,
            "last_test": datetime.now(timezone.utc).isoformat(),
            "api_key_present": bool(key),
        }

        label = "HEALTHY" if is_healthy else "UNHEALTHY"
        log(" " + name + ": " + label + " - " + detail)

    return results

def get_fallback_providers(health_map, priority_order=None):
    """Return the healthy providers, preferred ones first.

    Args:
        health_map: Mapping of provider name to an info dict; a provider
            counts as healthy when its ``"healthy"`` entry is truthy. Entries
            without that key are treated as unhealthy.
        priority_order: Optional sequence of provider names in order of
            preference. Defaults to nous, openrouter, ollama, anthropic,
            kimi-coding.

    Returns:
        A list with the healthy providers from ``priority_order`` first, in
        that order, followed by any other healthy providers in ``health_map``
        iteration order. No name appears twice.
    """
    if priority_order is None:
        priority_order = ["nous", "openrouter", "ollama", "anthropic", "kimi-coding"]

    def is_healthy(name):
        info = health_map.get(name)
        return bool(info) and bool(info.get("healthy"))

    # dict.fromkeys drops duplicate names from a caller-supplied priority list.
    ranked = [name for name in dict.fromkeys(priority_order) if is_healthy(name)]

    # Add any other healthy providers not in the priority list; the set keeps
    # the membership test constant-time.
    already_ranked = set(ranked)
    ranked.extend(
        name for name in health_map
        if name not in already_ranked and is_healthy(name)
    )
    return ranked

def update_profile_config(profile_name, new_provider):
    """Point a profile's ``config.yaml`` at a different provider.

    Sets ``model.provider`` to ``new_provider`` and rewrites every auxiliary
    entry whose provider equalled the previous model provider.

    Args:
        profile_name: Directory name of the profile under PROFILES_DIR.
        new_provider: Provider name to write.

    Returns:
        ``(True, "Updated")`` on success, otherwise ``(False, reason)``. The
        file is left untouched unless the new content was fully serialised.
    """
    config_file = PROFILES_DIR / profile_name / "config.yaml"

    if not config_file.exists():
        return False, "Config file not found"

    try:
        with open(config_file) as f:
            config = yaml.safe_load(f)

        # safe_load returns None for an empty file; treat that as "no settings".
        if config is None:
            config = {}
        if not isinstance(config, dict):
            return False, "Config is not a mapping"

        # Update model provider
        model = config.get("model")
        if model is None:
            model = config["model"] = {}
        elif not isinstance(model, dict):
            # Refuse to overwrite a scalar "model:" value with a mapping.
            return False, "Config 'model' section is not a mapping"

        old_provider = model.get("provider", "unknown")
        model["provider"] = new_provider

        # Update auxiliary providers if they were using the old provider
        auxiliary = config.get("auxiliary") or {}
        if isinstance(auxiliary, dict):
            for aux_config in auxiliary.values():
                if isinstance(aux_config, dict) and aux_config.get("provider") == old_provider:
                    aux_config["provider"] = new_provider

        # Serialise first, then open for writing: opening with "w" truncates
        # the file, so a failure inside yaml.dump would otherwise destroy the
        # existing config.
        rendered = yaml.dump(config, default_flow_style=False)
        with open(config_file, "w") as f:
            f.write(rendered)

        # str() keeps a non-string old value (e.g. null) from raising here and
        # turning a completed update into a reported failure.
        log("Updated " + profile_name + ": " + str(old_provider) + " -> " + new_provider)
        return True, "Updated"

    except Exception as e:
        return False, str(e)

def check_profiles(health_map):
    """Switch every profile whose model provider is unhealthy to a healthy one.

    A profile is left alone when its ``model.provider`` is listed as healthy
    in ``health_map``. Providers missing from ``health_map`` count as
    unhealthy. The replacement is the first entry returned by
    ``get_fallback_providers`` that differs from the current provider.

    Args:
        health_map: Mapping of provider name to an info dict with a
            ``"healthy"`` flag, as produced by ``build_health_map``.

    Returns:
        A list of ``{"profile", "old_provider", "new_provider"}`` dicts, one
        per profile that was rewritten. The list is empty (never ``None``)
        when nothing changed, the profiles directory is missing, or no
        provider is healthy.
    """
    updated_profiles = []

    if not PROFILES_DIR.exists():
        return updated_profiles

    fallback_providers = get_fallback_providers(health_map)
    if not fallback_providers:
        log("CRITICAL: No healthy providers available!")
        return updated_profiles

    for profile_dir in PROFILES_DIR.iterdir():
        if not profile_dir.is_dir():
            continue

        profile_name = profile_dir.name
        config_file = profile_dir / "config.yaml"
        if not config_file.exists():
            continue

        try:
            with open(config_file) as f:
                config = yaml.safe_load(f)

            model_config = config.get("model", {})
            if not isinstance(model_config, dict):
                continue

            current_provider = model_config.get("provider", "")
            if not current_provider:
                continue

            # Check if current provider is healthy
            if current_provider in health_map and health_map[current_provider]["healthy"]:
                continue  # Provider is healthy, no action needed

            # Find best fallback
            best_fallback = next(
                (name for name in fallback_providers if name != current_provider),
                None,
            )
            if not best_fallback:
                log("No fallback for " + profile_name + " (current: " + current_provider + ")")
                continue

            # Update profile
            success, message = update_profile_config(profile_name, best_fallback)
            if success:
                updated_profiles.append({
                    "profile": profile_name,
                    "old_provider": current_provider,
                    "new_provider": best_fallback,
                })
            else:
                # Record the reason so a failed switch is not lost.
                log("Failed to update " + profile_name + ": " + str(message))

        except Exception as e:
            # One broken profile must not stop the remaining ones.
            log("Error processing " + profile_name + ": " + str(e))

    return updated_profiles

def load_state():
    """Load the persisted state from STATE_FILE (``tmux-state.json``).

    Returns:
        The decoded JSON object as a dict. An empty dict is returned when the
        file is missing, cannot be read, does not contain valid JSON, or its
        top-level value is not an object.
    """
    if not STATE_FILE.exists():
        return {}

    try:
        with open(STATE_FILE) as f:
            state = json.load(f)
    except (OSError, ValueError):
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError; anything else (e.g. KeyboardInterrupt) now propagates.
        return {}

    # Callers assign into the result by key, so a list or scalar is unusable.
    return state if isinstance(state, dict) else {}

def save_state(state):
    """Write ``state`` to STATE_FILE (``tmux-state.json``) as indented JSON.

    Args:
        state: JSON-serialisable mapping to persist.

    Raises:
        TypeError: If ``state`` holds a value JSON cannot encode. The existing
            file is not modified in that case.
    """
    # Encode before opening: open(..., "w") truncates immediately, so encoding
    # straight into the file would wipe the previous state on failure.
    payload = json.dumps(state, indent=2)

    LOG_DIR.mkdir(parents=True, exist_ok=True)
    with open(STATE_FILE, "w") as f:
        f.write(payload)

def run_once():
    """Perform one complete cycle: probe providers, fix profiles, persist, summarise.

    The stored state gains ``provider_health`` and ``last_provider_check`` on
    every run. ``last_profile_updates`` is only overwritten when at least one
    profile was switched during this run.
    """
    log("=== Provider Health Check ===")

    state = load_state()
    health_map = build_health_map()
    switched = check_profiles(health_map)

    # Record the outcome of this cycle.
    state["provider_health"] = health_map
    state["last_provider_check"] = datetime.now(timezone.utc).isoformat()
    if switched:
        state["last_profile_updates"] = switched
    save_state(state)

    # Summary
    healthy_entries = [entry for entry in health_map.values() if entry["healthy"]]
    log(
        "Health: " + str(len(healthy_entries)) + "/" + str(len(health_map))
        + " providers healthy"
    )

    if not switched:
        return

    log("Updated " + str(len(switched)) + " profiles:")
    for change in switched:
        log(" " + change["profile"] + ": " + change["old_provider"] + " -> " + change["new_provider"])

def show_status():
    """Print the stored provider health table and the last recorded profile switches.

    Reads only the persisted state and performs no probes. When no health
    data has been stored yet, a hint to run the check first is printed
    instead.
    """
    state = load_state()
    health_map = state.get("provider_health", {})

    if not health_map:
        print("No provider health data available. Run without --status first.")
        return

    last_check = str(state.get("last_provider_check", "unknown"))
    print("Provider Health (last updated: " + last_check + ")")
    print("=" * 80)

    for name in sorted(health_map):
        info = health_map[name]
        if info["healthy"]:
            status = "HEALTHY"
        else:
            status = "UNHEALTHY"
        message = info.get("message", "")
        key_flag = "yes" if info.get("api_key_present") else "no"

        row = name.ljust(20) + " " + status.ljust(10) + " API key: " + key_flag
        print(row + " - " + message)

    # Show recent updates
    updates = state.get("last_profile_updates", [])
    if not updates:
        return

    print()
    print("Recent Profile Updates:")
    for entry in updates:
        print(" " + entry["profile"] + ": " + entry["old_provider"] + " -> " + entry["new_provider"])

def daemon_mode(interval=300, retry_delay=60):
    """Run the health check in a loop until interrupted.

    Args:
        interval: Seconds to wait after a successful check (default 300,
            i.e. five minutes).
        retry_delay: Seconds to wait after a check raised an exception
            (default 60).

    Ctrl-C stops the loop with a log line at any point, whether a check is
    running, the normal wait is in progress, or the shorter error back-off
    is in progress.
    """
    log("Starting provider health daemon (check every " + str(interval) + "s)")

    # The KeyboardInterrupt handler wraps the whole loop so an interrupt that
    # arrives during the error back-off sleep is handled as well.
    try:
        while True:
            try:
                run_once()
            except Exception as e:
                log("Error: " + str(e))
                time.sleep(retry_delay)
            else:
                time.sleep(interval)
    except KeyboardInterrupt:
        log("Daemon stopped by user")

def main():
    """Select the mode of operation from the command-line flags.

    ``--status`` prints the stored health data and takes precedence over
    ``--daemon``, which loops forever. With neither flag a single check runs.
    """
    argv = sys.argv
    if "--status" in argv:
        action = show_status
    elif "--daemon" in argv:
        action = daemon_mode
    else:
        action = run_once
    action()


if __name__ == "__main__":
    main()