#!/usr/bin/env python3
"""
Provider Health Monitor Script
Issue #509: [Robustness] Provider-aware profile config — auto-switch on failure

Monitors provider health and automatically switches profiles to working
providers.

Usage:
    python3 provider-health-monitor.py          # Run once
    python3 provider-health-monitor.py --daemon # Run continuously
    python3 provider-health-monitor.py --status # Show provider health

Passing --quiet anywhere on the command line suppresses the console echo of
log lines (they are still appended to the log file).
"""

import json
import os
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Configuration
HERMES_HOME = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
PROFILES_DIR = HERMES_HOME / "profiles"
LOG_DIR = Path.home() / ".local" / "timmy" / "fleet-health"
STATE_FILE = LOG_DIR / "tmux-state.json"
LOG_FILE = LOG_DIR / "provider-health.log"

# Daemon timing (seconds)
CHECK_INTERVAL_SECONDS = 300  # pause after a successful health check
ERROR_BACKOFF_SECONDS = 60    # shorter pause after a failed health check

# Provider test endpoints.
# Each entry describes one cheap HTTP request used to probe the provider;
# "headers" is a callable that builds the request headers from an API key.
PROVIDER_TESTS = {
    "openrouter": {
        "url": "https://openrouter.ai/api/v1/models",
        "method": "GET",
        "headers": lambda api_key: {"Authorization": "Bearer " + api_key},
        "timeout": 10,
    },
    "anthropic": {
        "url": "https://api.anthropic.com/v1/models",
        "method": "GET",
        "headers": lambda api_key: {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
        },
        "timeout": 10,
    },
    "nous": {
        "url": "https://inference.nousresearch.com/v1/models",
        "method": "GET",
        "headers": lambda api_key: {"Authorization": "Bearer " + api_key},
        "timeout": 10,
    },
    "kimi-coding": {
        "url": "https://api.kimi.com/coding/v1/models",
        "method": "GET",
        "headers": lambda api_key: {
            "x-api-key": api_key,
            "x-api-provider": "kimi-coding",
        },
        "timeout": 10,
    },
    "ollama": {
        "url": "http://localhost:11434/api/tags",
        "method": "GET",
        "headers": lambda api_key: {},
        "timeout": 5,
    },
}

# Errors that mean "this config file cannot be used"; ValueError also covers
# UnicodeDecodeError and the non-mapping check in _load_yaml_config.
_CONFIG_ERRORS = (OSError, ValueError, yaml.YAMLError)


def log(msg):
    """Append a UTC-timestamped line to LOG_FILE and echo it to stdout.

    The echo is skipped when "--quiet" is present in sys.argv.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    log_entry = "[" + timestamp + "] " + msg

    LOG_DIR.mkdir(parents=True, exist_ok=True)
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(log_entry + "\n")

    if "--quiet" not in sys.argv:
        print(log_entry)


def _api_key_env_names(provider):
    """Return the variable names to try for *provider*, legacy name first.

    A hyphenated provider such as "kimi-coding" yields "KIMI-CODING_API_KEY",
    which most shells cannot export, so the underscore form
    "KIMI_CODING_API_KEY" is tried as well.
    """
    legacy = provider.upper() + "_API_KEY"
    normalized = legacy.replace("-", "_")
    if normalized == legacy:
        return [legacy]
    return [legacy, normalized]


def get_provider_api_key(provider):
    """Get API key for a provider from HERMES_HOME/.env or the environment.

    The .env file is searched first; the first non-empty matching entry wins.
    If nothing usable is found there, the process environment is consulted.

    Returns:
        The key as a string, or None when no non-empty key is configured.
    """
    env_names = _api_key_env_names(provider)

    env_file = HERMES_HOME / ".env"
    if env_file.exists():
        try:
            # errors="replace" keeps a stray non-UTF-8 byte from aborting
            # the whole lookup.
            with open(env_file, encoding="utf-8", errors="replace") as f:
                for line in f:
                    line = line.strip()
                    for name in env_names:
                        if line.startswith(name + "="):
                            value = line.split("=", 1)[1].strip().strip("'\"")
                            if value:
                                return value
        except OSError as e:
            log("Could not read " + str(env_file) + ": " + str(e)[:100])

    for name in env_names:
        value = os.environ.get(name)
        if value:
            return value
    return None


def test_provider(provider, api_key=None):
    """Test if a provider is healthy by requesting its probe endpoint.

    Args:
        provider: Key into PROVIDER_TESTS.
        api_key: Key passed to the provider's header builder; None is
            treated as an empty string.

    Returns:
        (healthy, message). HTTP 200 and HTTP 429 (rate limited, but the
        endpoint is reachable) count as healthy; every other outcome,
        including an unknown provider name, counts as unhealthy.
    """
    config = PROVIDER_TESTS.get(provider)
    if not config:
        return False, "Unknown provider: " + str(provider)

    headers = config["headers"](api_key or "")

    try:
        req = urllib.request.Request(
            config["url"],
            headers=headers,
            method=config["method"],
        )
        # Context manager closes the connection even on the early returns.
        with urllib.request.urlopen(req, timeout=config["timeout"]) as resp:
            if resp.status == 200:
                return True, "Healthy"
            return False, "HTTP " + str(resp.status)
    except urllib.error.HTTPError as e:
        if e.code == 401:
            return False, "Unauthorized (401)"
        elif e.code == 403:
            return False, "Forbidden (403)"
        elif e.code == 429:
            return True, "Rate limited but accessible"
        else:
            return False, "HTTP " + str(e.code)
    except Exception as e:
        # Network errors, timeouts, TLS failures, ...: report, never raise.
        return False, str(e)[:100]


def _load_yaml_config(config_file):
    """Parse *config_file* and return its top-level mapping.

    An empty file yields {}.

    Raises:
        OSError: the file cannot be read.
        yaml.YAMLError: the file is not valid YAML.
        ValueError: the file is not UTF-8 or its top level is not a mapping.
    """
    with open(config_file, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    if data is None:
        return {}
    if not isinstance(data, dict):
        raise ValueError("top-level YAML value is not a mapping")
    return data


def _providers_from_config(config):
    """Return the set of provider names referenced by one parsed config.

    Reads model.provider and every auxiliary.<name>.provider; the auxiliary
    value "auto" is ignored. Non-string provider values are skipped.
    """
    found = set()

    # Primary model provider
    model_config = config.get("model", {})
    if isinstance(model_config, dict):
        provider = model_config.get("provider", "")
        if provider and isinstance(provider, str):
            found.add(provider)

    # Auxiliary providers ("auxiliary:" with no value parses as None)
    auxiliary = config.get("auxiliary") or {}
    if isinstance(auxiliary, dict):
        for aux_config in auxiliary.values():
            if isinstance(aux_config, dict):
                provider = aux_config.get("provider", "")
                if (
                    provider
                    and isinstance(provider, str)
                    and provider != "auto"
                ):
                    found.add(provider)

    return found


def get_all_providers():
    """Get all providers from profiles and global config.

    Unreadable or malformed config files are logged and skipped.
    "openrouter", "nous" and "ollama" are always included.

    Returns:
        Sorted list of unique provider names.
    """
    providers = set()

    # Global config first, then one config per profile directory.
    config_files = [HERMES_HOME / "config.yaml"]
    if PROFILES_DIR.is_dir():
        try:
            config_files.extend(
                profile_dir / "config.yaml"
                for profile_dir in sorted(PROFILES_DIR.iterdir())
                if profile_dir.is_dir()
            )
        except OSError as e:
            log("Could not list " + str(PROFILES_DIR) + ": " + str(e)[:100])

    for config_file in config_files:
        if not config_file.exists():
            continue
        try:
            providers |= _providers_from_config(_load_yaml_config(config_file))
        except _CONFIG_ERRORS as e:
            log("Skipping config " + str(config_file) + ": " + str(e)[:100])

    # Add common providers even if not configured
    providers.update(["openrouter", "nous", "ollama"])

    return sorted(providers)


def build_health_map():
    """Build a health map of all providers.

    Returns:
        Dict mapping provider name to a dict with the keys "healthy",
        "message", "last_test" (UTC ISO-8601) and "api_key_present".
    """
    providers = get_all_providers()
    health_map = {}

    log("Testing " + str(len(providers)) + " providers...")

    for provider in providers:
        api_key = get_provider_api_key(provider)
        healthy, message = test_provider(provider, api_key)

        health_map[provider] = {
            "healthy": healthy,
            "message": message,
            "last_test": datetime.now(timezone.utc).isoformat(),
            "api_key_present": bool(api_key),
        }

        status = "HEALTHY" if healthy else "UNHEALTHY"
        log(" " + provider + ": " + status + " - " + message)

    return health_map


def get_fallback_providers(health_map):
    """Get list of healthy providers in priority order.

    Providers named in the built-in priority list come first (in that
    order); any other healthy provider follows in health_map order.
    """
    # Priority order: nous, openrouter, ollama, others
    priority_order = ["nous", "openrouter", "ollama", "anthropic", "kimi-coding"]

    healthy = [
        provider
        for provider in priority_order
        if health_map.get(provider, {}).get("healthy")
    ]

    # Add any other healthy providers not in priority list
    for provider, info in health_map.items():
        if info.get("healthy") and provider not in healthy:
            healthy.append(provider)

    return healthy


def update_profile_config(profile_name, new_provider):
    """Update a profile's config to use a new provider.

    Sets model.provider to *new_provider* and rewrites every auxiliary entry
    that was using the profile's previous model provider.

    Returns:
        (success, message). Never raises; failures are reported through the
        returned tuple.
    """
    config_file = PROFILES_DIR / profile_name / "config.yaml"

    if not config_file.exists():
        return False, "Config file not found"

    try:
        config = _load_yaml_config(config_file)

        # Update model provider. A bare "model:" parses as None; any other
        # non-mapping value is left alone rather than overwritten.
        model_config = config.get("model")
        if model_config is None:
            model_config = config["model"] = {}
        elif not isinstance(model_config, dict):
            return False, "model section is not a mapping"

        old_provider = model_config.get("provider")
        model_config["provider"] = new_provider

        # Update auxiliary providers if they were using the old provider.
        # Skipped when the profile had no provider: there is nothing to match.
        auxiliary = config.get("auxiliary") or {}
        if old_provider and isinstance(auxiliary, dict):
            for aux_config in auxiliary.values():
                if (
                    isinstance(aux_config, dict)
                    and aux_config.get("provider") == old_provider
                ):
                    aux_config["provider"] = new_provider

        # Serialize before opening the file so a dump failure cannot leave
        # a truncated config behind.
        serialized = yaml.dump(config, default_flow_style=False)
        with open(config_file, "w", encoding="utf-8") as f:
            f.write(serialized)

        log(
            "Updated " + profile_name + ": "
            + str(old_provider or "unknown") + " -> " + str(new_provider)
        )
        return True, "Updated"

    except Exception as e:
        return False, str(e)


def check_profiles(health_map):
    """Check all profiles and switch those whose provider is unhealthy.

    Returns:
        List of dicts with the keys "profile", "old_provider" and
        "new_provider", one per profile that was rewritten. The list is
        empty when nothing was changed (including when the profiles
        directory is missing or no provider is healthy).
    """
    updated_profiles = []

    if not PROFILES_DIR.is_dir():
        return updated_profiles

    fallback_providers = get_fallback_providers(health_map)
    if not fallback_providers:
        log("CRITICAL: No healthy providers available!")
        return updated_profiles

    for profile_dir in sorted(PROFILES_DIR.iterdir()):
        if not profile_dir.is_dir():
            continue

        profile_name = profile_dir.name
        config_file = profile_dir / "config.yaml"

        if not config_file.exists():
            continue

        try:
            config = _load_yaml_config(config_file)

            model_config = config.get("model", {})
            if not isinstance(model_config, dict):
                continue

            current_provider = model_config.get("provider", "")
            if not current_provider or not isinstance(current_provider, str):
                continue

            # NOTE(review): a provider without a PROVIDER_TESTS entry is
            # reported unhealthy ("Unknown provider"), so profiles using it
            # are switched as well - confirm this is intended.
            if health_map.get(current_provider, {}).get("healthy"):
                continue  # Provider is healthy, no action needed

            # Find best fallback
            best_fallback = next(
                (p for p in fallback_providers if p != current_provider),
                None,
            )
            if best_fallback is None:
                log(
                    "No fallback for " + profile_name
                    + " (current: " + current_provider + ")"
                )
                continue

            # Update profile
            success, message = update_profile_config(profile_name, best_fallback)
            if success:
                updated_profiles.append({
                    "profile": profile_name,
                    "old_provider": current_provider,
                    "new_provider": best_fallback,
                })
            else:
                log("Failed to update " + profile_name + ": " + message)

        except Exception as e:
            # Best effort: one broken profile must not stop the others.
            log("Error processing " + profile_name + ": " + str(e))

    return updated_profiles


def load_state():
    """Load state from tmux-state.json.

    Returns:
        The stored dict, or {} when the file is missing, unreadable, not
        valid JSON, or does not contain a JSON object.
    """
    if STATE_FILE.exists():
        try:
            with open(STATE_FILE, encoding="utf-8") as f:
                state = json.load(f)
        except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
            log("Could not read state file: " + str(e)[:100])
        else:
            if isinstance(state, dict):
                return state
    return {}


def save_state(state):
    """Save state to tmux-state.json."""
    # Serialize first so a failure here cannot truncate the existing file.
    serialized = json.dumps(state, indent=2)
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    with open(STATE_FILE, "w", encoding="utf-8") as f:
        f.write(serialized)


def run_once():
    """Run provider health check once: probe, fix profiles, persist, report."""
    log("=== Provider Health Check ===")

    state = load_state()

    # Build health map
    health_map = build_health_map()

    # Check profiles and update if needed
    updated_profiles = check_profiles(health_map)

    # Update state (other keys already in the state file are preserved)
    state["provider_health"] = health_map
    state["last_provider_check"] = datetime.now(timezone.utc).isoformat()

    if updated_profiles:
        state["last_profile_updates"] = updated_profiles

    save_state(state)

    # Summary
    healthy_count = sum(1 for p in health_map.values() if p["healthy"])
    total_count = len(health_map)

    log(
        "Health: " + str(healthy_count) + "/" + str(total_count)
        + " providers healthy"
    )

    if updated_profiles:
        log("Updated " + str(len(updated_profiles)) + " profiles:")
        for update in updated_profiles:
            log(
                " " + update["profile"] + ": " + update["old_provider"]
                + " -> " + update["new_provider"]
            )


def show_status():
    """Print the provider health recorded by the most recent run."""
    state = load_state()
    health_map = state.get("provider_health", {})

    if not health_map or not isinstance(health_map, dict):
        print("No provider health data available. Run without --status first.")
        return

    print(
        "Provider Health (last updated: "
        + str(state.get("last_provider_check", "unknown")) + ")"
    )
    print("=" * 80)

    for provider, info in sorted(health_map.items()):
        if not isinstance(info, dict):
            info = {}  # tolerate a hand-edited or foreign state entry
        status = "HEALTHY" if info.get("healthy") else "UNHEALTHY"
        message = str(info.get("message", ""))
        api_key = "yes" if info.get("api_key_present") else "no"

        print(
            provider.ljust(20) + " " + status.ljust(10)
            + " API key: " + api_key + " - " + message
        )

    # Show recent updates
    updates = state.get("last_profile_updates", [])
    if updates:
        print()
        print("Recent Profile Updates:")
        for update in updates:
            print(
                " " + update["profile"] + ": " + update["old_provider"]
                + " -> " + update["new_provider"]
            )


def daemon_mode():
    """Run health checks continuously until interrupted with Ctrl-C.

    Sleeps CHECK_INTERVAL_SECONDS after a successful check and
    ERROR_BACKOFF_SECONDS after a failed one.
    """
    log(
        "Starting provider health daemon (check every "
        + str(CHECK_INTERVAL_SECONDS) + "s)"
    )

    # The KeyboardInterrupt handler wraps the whole loop so Ctrl-C is also
    # handled while sleeping in the error-backoff path.
    try:
        while True:
            try:
                run_once()
                delay = CHECK_INTERVAL_SECONDS  # Check every 5 minutes
            except Exception as e:
                log("Error: " + str(e))
                delay = ERROR_BACKOFF_SECONDS
            time.sleep(delay)
    except KeyboardInterrupt:
        log("Daemon stopped by user")


def main():
    """Dispatch on command-line flags: --status, --daemon, or a single run."""
    if "--status" in sys.argv:
        show_status()
    elif "--daemon" in sys.argv:
        daemon_mode()
    else:
        run_once()


if __name__ == "__main__":
    main()