#!/usr/bin/env python3
"""
Provider Health Monitor Script
Issue #509: [Robustness] Provider-aware profile config — auto-switch on failure

Monitors provider health and automatically switches profiles to working providers.

Usage:
    python3 provider-health-monitor.py              # Run once
    python3 provider-health-monitor.py --daemon     # Run continuously
    python3 provider-health-monitor.py --status     # Show provider health

Pass --quiet with any mode to suppress console output of log lines.
"""

import json
import os
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

try:
    import yaml
except ImportError:
    # PyYAML is only needed to read/write profile configs. --status reads the
    # JSON state file only, so it must keep working without PyYAML installed.
    yaml = None

# Configuration
HERMES_HOME = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
PROFILES_DIR = HERMES_HOME / "profiles"
LOG_DIR = Path.home() / ".local" / "timmy" / "fleet-health"
STATE_FILE = LOG_DIR / "tmux-state.json"
LOG_FILE = LOG_DIR / "provider-health.log"

# Daemon timing (seconds): normal interval between checks, and the shorter
# back-off used after a check raised an unexpected error.
CHECK_INTERVAL_SECONDS = 300
ERROR_RETRY_SECONDS = 60

# Provider test endpoints
PROVIDER_TESTS = {
    "openrouter": {
        "url": "https://openrouter.ai/api/v1/models",
        "method": "GET",
        "headers": lambda api_key: {"Authorization": "Bearer " + api_key},
        "timeout": 10,
    },
    "anthropic": {
        "url": "https://api.anthropic.com/v1/models",
        "method": "GET",
        "headers": lambda api_key: {"x-api-key": api_key, "anthropic-version": "2023-06-01"},
        "timeout": 10,
    },
    "nous": {
        "url": "https://inference.nousresearch.com/v1/models",
        "method": "GET",
        "headers": lambda api_key: {"Authorization": "Bearer " + api_key},
        "timeout": 10,
    },
    "kimi-coding": {
        "url": "https://api.kimi.com/coding/v1/models",
        "method": "GET",
        "headers": lambda api_key: {"x-api-key": api_key, "x-api-provider": "kimi-coding"},
        "timeout": 10,
    },
    "ollama": {
        "url": "http://localhost:11434/api/tags",
        "method": "GET",
        "headers": lambda api_key: {},
        "timeout": 5,
    },
}

# Preferred order when choosing a fallback; healthy providers that are not
# listed here are appended after these.
FALLBACK_PRIORITY = ["nous", "openrouter", "ollama", "anthropic", "kimi-coding"]

# HTTP error codes that get a descriptive health verdict.
HTTP_ERROR_VERDICTS = {
    401: (False, "Unauthorized (401)"),
    403: (False, "Forbidden (403)"),
    429: (True, "Rate limited but accessible"),
}


def log(msg):
    """Append a UTC-timestamped line to LOG_FILE and echo it unless --quiet.

    Writing the log file is best-effort: an unwritable log directory must not
    take the monitor down (the daemon's own error handler calls log()).
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {msg}"

    try:
        LOG_DIR.mkdir(parents=True, exist_ok=True)
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(log_entry + "\n")
    except OSError as e:
        print(f"warning: cannot write {LOG_FILE}: {e}", file=sys.stderr)

    if "--quiet" not in sys.argv:
        print(log_entry)


def _api_key_names(provider):
    """Return candidate variable names holding *provider*'s API key.

    Hyphens are not valid in environment variable names, so "kimi-coding"
    maps to KIMI_CODING_API_KEY. The literal upper-cased form is kept as a
    second candidate so existing .env entries keep working.
    """
    literal = provider.upper()
    normalized = literal.replace("-", "_")
    names = [normalized + "_API_KEY"]
    if literal != normalized:
        names.append(literal + "_API_KEY")
    return names


def get_provider_api_key(provider):
    """Get the API key for a provider from HERMES_HOME/.env or the environment.

    The .env file takes precedence over the process environment.

    Returns:
        The key as a string, or None when no key is configured.
    """
    names = _api_key_names(provider)

    env_file = HERMES_HOME / ".env"
    if env_file.exists():
        try:
            with open(env_file, encoding="utf-8") as f:
                for line in f:
                    key, sep, value = line.strip().partition("=")
                    if sep and key.strip() in names:
                        return value.strip().strip("'\"")
        except (OSError, UnicodeDecodeError) as e:
            # Fall back to the process environment instead of aborting the check.
            log(f"Could not read {env_file}: {e}")

    for name in names:
        value = os.environ.get(name)
        if value is not None:
            return value
    return None


def test_provider(provider, api_key=None):
    """Probe a provider's endpoint from PROVIDER_TESTS.

    Args:
        provider: Provider name; must be a key of PROVIDER_TESTS.
        api_key: API key passed to the provider's header builder (optional).

    Returns:
        (healthy, message). HTTP 200 and HTTP 429 count as healthy; any other
        status, a network error, or an unknown provider counts as unhealthy.
    """
    config = PROVIDER_TESTS.get(provider)
    if not config:
        return False, "Unknown provider: " + provider

    headers = config["headers"](api_key or "")

    try:
        req = urllib.request.Request(
            config["url"],
            headers=headers,
            method=config["method"],
        )
        # Context manager closes the connection once the status is read.
        with urllib.request.urlopen(req, timeout=config["timeout"]) as resp:
            status = resp.status
    except urllib.error.HTTPError as e:
        return HTTP_ERROR_VERDICTS.get(e.code, (False, "HTTP " + str(e.code)))
    except Exception as e:
        return False, str(e)[:100]

    if status == 200:
        return True, "Healthy"
    return False, "HTTP " + str(status)


# Production helper, not a unit test: stop pytest from collecting it when this
# module's names are imported into a test module.
test_provider.__test__ = False


def _load_config(path):
    """Parse a YAML config file and return its top-level mapping.

    An empty file yields {}. Raises RuntimeError when PyYAML is missing and
    ValueError when the document is not a mapping.
    """
    if yaml is None:
        raise RuntimeError("PyYAML is not installed")
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    if data is None:  # empty file
        return {}
    if not isinstance(data, dict):
        raise ValueError("top-level YAML value is not a mapping")
    return data


def _configured_providers(config):
    """Return the set of provider names referenced by a parsed config mapping."""
    found = set()

    # Primary model provider
    model_config = config.get("model", {})
    if isinstance(model_config, dict):
        provider = model_config.get("provider", "")
        if provider and isinstance(provider, str):
            found.add(provider)

    # Auxiliary providers ("auto" is a placeholder, not a real provider)
    auxiliary = config.get("auxiliary") or {}
    if isinstance(auxiliary, dict):
        for aux_config in auxiliary.values():
            if isinstance(aux_config, dict):
                provider = aux_config.get("provider", "")
                if provider and isinstance(provider, str) and provider != "auto":
                    found.add(provider)

    return found


def get_all_providers():
    """Get all providers from profiles and global config.

    Returns:
        Sorted list of provider names. The common providers "openrouter",
        "nous" and "ollama" are always included, even if not configured.
    """
    config_files = [HERMES_HOME / "config.yaml"]
    if PROFILES_DIR.exists():
        config_files.extend(
            profile_dir / "config.yaml"
            for profile_dir in sorted(PROFILES_DIR.iterdir())
            if profile_dir.is_dir()
        )

    providers = {"openrouter", "nous", "ollama"}
    for config_file in config_files:
        if not config_file.exists():
            continue
        try:
            providers |= _configured_providers(_load_config(config_file))
        except Exception as e:
            # One broken config must not stop discovery, but say so.
            log(f"Skipping unreadable config {config_file}: {e}")

    return sorted(providers)


def build_health_map():
    """Test every known provider and return {provider: health record}.

    Each record has the keys "healthy", "message", "last_test" (UTC ISO
    timestamp) and "api_key_present".
    """
    providers = get_all_providers()
    health_map = {}

    log(f"Testing {len(providers)} providers...")

    for provider in providers:
        api_key = get_provider_api_key(provider)
        healthy, message = test_provider(provider, api_key)

        health_map[provider] = {
            "healthy": healthy,
            "message": message,
            "last_test": datetime.now(timezone.utc).isoformat(),
            "api_key_present": bool(api_key),
        }

        status = "HEALTHY" if healthy else "UNHEALTHY"
        log(f" {provider}: {status} - {message}")

    return health_map


def get_fallback_providers(health_map):
    """Get list of healthy providers in priority order.

    Providers in FALLBACK_PRIORITY come first (in that order), followed by
    any other healthy providers in health_map order.
    """
    def is_healthy(name):
        info = health_map.get(name)
        return bool(info and info.get("healthy"))

    ranked = [p for p in FALLBACK_PRIORITY if is_healthy(p)]
    prioritized = set(FALLBACK_PRIORITY)
    others = [p for p in health_map if p not in prioritized and is_healthy(p)]
    return ranked + others


def update_profile_config(profile_name, new_provider):
    """Update a profile's config to use a new provider.

    Sets model.provider to new_provider and retargets every auxiliary entry
    that was using the old model provider.

    Returns:
        (success, message).
    """
    config_file = PROFILES_DIR / profile_name / "config.yaml"

    if not config_file.exists():
        return False, "Config file not found"

    try:
        config = _load_config(config_file)

        model_config = config.get("model")
        if model_config is None:
            model_config = config["model"] = {}
        elif not isinstance(model_config, dict):
            return False, "model section is not a mapping"

        old_provider = model_config.get("provider", "unknown")
        model_config["provider"] = new_provider

        # Update auxiliary providers if they were using the old provider
        auxiliary = config.get("auxiliary") or {}
        if isinstance(auxiliary, dict):
            for aux_config in auxiliary.values():
                if isinstance(aux_config, dict) and aux_config.get("provider") == old_provider:
                    aux_config["provider"] = new_provider

        # Serialize BEFORE opening the file for writing, so a serialization
        # error cannot leave the profile with a truncated config.
        text = yaml.dump(config, default_flow_style=False)
        with open(config_file, "w", encoding="utf-8") as f:
            f.write(text)

        log(f"Updated {profile_name}: {old_provider} -> {new_provider}")
        return True, "Updated"

    except Exception as e:
        return False, str(e)


def check_profiles(health_map):
    """Check all profiles and switch those whose provider tested unhealthy.

    A profile is only rewritten when its provider was actually probed and
    failed. Providers this monitor cannot test (no PROVIDER_TESTS entry, or
    no record in health_map) are left unchanged.

    Returns:
        List of {"profile", "old_provider", "new_provider"} dicts, one per
        profile that was switched; empty when nothing changed.
    """
    if not PROFILES_DIR.exists():
        return []

    fallback_providers = get_fallback_providers(health_map)
    if not fallback_providers:
        log("CRITICAL: No healthy providers available!")
        return []

    updated_profiles = []

    for profile_dir in sorted(PROFILES_DIR.iterdir()):
        config_file = profile_dir / "config.yaml"
        if not profile_dir.is_dir() or not config_file.exists():
            continue

        profile_name = profile_dir.name

        try:
            model_config = _load_config(config_file).get("model", {})
            if not isinstance(model_config, dict):
                continue

            current_provider = model_config.get("provider", "")
            if not current_provider:
                continue

            health = health_map.get(current_provider)
            if health is None or current_provider not in PROVIDER_TESTS:
                # No real evidence the provider is down; do not touch the profile.
                log(f"No health test for provider {current_provider}; leaving {profile_name} unchanged")
                continue

            if health.get("healthy"):
                continue  # Provider is healthy, no action needed

            # Find best fallback
            best_fallback = next(
                (p for p in fallback_providers if p != current_provider),
                None,
            )
            if not best_fallback:
                log(f"No fallback for {profile_name} (current: {current_provider})")
                continue

            # Update profile
            success, message = update_profile_config(profile_name, best_fallback)
            if success:
                updated_profiles.append({
                    "profile": profile_name,
                    "old_provider": current_provider,
                    "new_provider": best_fallback,
                })
            else:
                log(f"Failed to update {profile_name}: {message}")

        except Exception as e:
            log(f"Error processing {profile_name}: {e}")

    return updated_profiles


def load_state():
    """Load state from tmux-state.json; return {} if missing or unusable."""
    if STATE_FILE.exists():
        try:
            with open(STATE_FILE, encoding="utf-8") as f:
                state = json.load(f)
        except (OSError, ValueError) as e:  # ValueError covers bad JSON / bad encoding
            log(f"Ignoring unreadable state file {STATE_FILE}: {e}")
        else:
            if isinstance(state, dict):
                return state
    return {}


def save_state(state):
    """Save state to tmux-state.json.

    The file is written to a temporary sibling and moved into place with
    os.replace, so a concurrent reader never sees a half-written file.
    """
    LOG_DIR.mkdir(parents=True, exist_ok=True)

    text = json.dumps(state, indent=2)
    tmp_file = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        f.write(text)
    os.replace(tmp_file, STATE_FILE)


def _format_update(update):
    """Render one profile-update record as ' profile: old -> new'."""
    profile = update.get("profile", "?")
    old_provider = update.get("old_provider", "?")
    new_provider = update.get("new_provider", "?")
    return f" {profile}: {old_provider} -> {new_provider}"


def run_once():
    """Run provider health check once: probe, switch profiles, persist state."""
    log("=== Provider Health Check ===")

    state = load_state()

    # Build health map
    health_map = build_health_map()

    # Check profiles and update if needed
    updated_profiles = check_profiles(health_map)

    # Update state
    state["provider_health"] = health_map
    state["last_provider_check"] = datetime.now(timezone.utc).isoformat()

    if updated_profiles:
        state["last_profile_updates"] = updated_profiles

    save_state(state)

    # Summary
    healthy_count = sum(1 for p in health_map.values() if p["healthy"])
    total_count = len(health_map)

    log(f"Health: {healthy_count}/{total_count} providers healthy")

    if updated_profiles:
        log(f"Updated {len(updated_profiles)} profiles:")
        for update in updated_profiles:
            log(_format_update(update))


def show_status():
    """Print the provider health recorded by the last check (no probing)."""
    state = load_state()
    health_map = state.get("provider_health", {})

    if not health_map:
        print("No provider health data available. Run without --status first.")
        return

    last_check = state.get("last_provider_check", "unknown")
    print(f"Provider Health (last updated: {last_check})")
    print("=" * 80)

    for provider, info in sorted(health_map.items()):
        status = "HEALTHY" if info.get("healthy") else "UNHEALTHY"
        message = info.get("message", "")
        api_key = "yes" if info.get("api_key_present") else "no"

        print(f"{provider:<20} {status:<10} API key: {api_key} - {message}")

    # Show recent updates
    updates = state.get("last_profile_updates", [])
    if updates:
        print()
        print("Recent Profile Updates:")
        for update in updates:
            print(_format_update(update))


def daemon_mode():
    """Run continuously until interrupted with Ctrl-C."""
    log(f"Starting provider health daemon (check every {CHECK_INTERVAL_SECONDS}s)")

    while True:
        try:
            run_once()
            time.sleep(CHECK_INTERVAL_SECONDS)  # Check every 5 minutes
        except KeyboardInterrupt:
            log("Daemon stopped by user")
            break
        except Exception as e:
            # Keep the daemon alive; retry sooner than the normal interval.
            log("Error: " + str(e))
            time.sleep(ERROR_RETRY_SECONDS)


def main():
    """Dispatch on --status / --daemon; default is a single check."""
    if "--status" in sys.argv:
        show_status()
        return

    if yaml is None:
        # Probing and switching need to read/write YAML profile configs.
        sys.exit("PyYAML is required for health checks: pip install pyyaml")

    if "--daemon" in sys.argv:
        daemon_mode()
    else:
        run_once()


if __name__ == "__main__":
    main()