import argparse
import datetime
import os
import subprocess
import sys

# NOTE(review): FLEET -- the module-level mapping of host name to
# {"ip": ..., "port": ...} -- belongs to the configuration section of this
# module. Its entries are not visible in this view, so they are not
# reproduced here; confirm it is still defined above these definitions.


class SelfHealer:
    """Audit every fleet host and optionally remediate common failures.

    The healer is safe by default: with ``dry_run=True`` it only reports
    what it would do. Remediation (service restart, log cleanup) happens
    only when ``dry_run`` is false, and each action is gated behind
    ``confirm()`` unless ``yes`` is set.

    Args:
        dry_run: Report intended actions without performing them.
        confirm_kill: Allow the (placeholder) process-termination prompt.
        yes: Answer every confirmation prompt with "yes".
        fleet: Optional mapping of host name to ``{"ip": ..., "port": ...}``.
            Defaults to the module-level ``FLEET``.
    """

    # Usage percentages above which a host is reported as unhealthy.
    DISK_LIMIT_PCT = 90
    MEMORY_LIMIT_PCT = 90

    def __init__(self, dry_run=True, confirm_kill=False, yes=False, fleet=None):
        self.dry_run = dry_run
        self.confirm_kill = confirm_kill
        self.yes = yes
        # Resolved lazily so callers (and tests) can supply their own fleet.
        self.fleet = FLEET if fleet is None else fleet

    def log(self, message: str):
        """Print ``message`` prefixed with a local wall-clock timestamp."""
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] {message}")

    def run_remote(self, host: str, command: str):
        """Run ``command`` on ``host`` and return the completed process.

        The host named ``"mac"`` is treated as the local machine and runs
        the command through ``bash -c``; every other host is reached over
        ssh as root.

        Returns:
            The ``subprocess.CompletedProcess``, or ``None`` when the command
            could not be run at all (timeout, missing binary, ...). The
            failure is logged before ``None`` is returned.
        """
        if host == "mac":
            cmd = ["bash", "-c", command]
        else:
            # Only the ssh path needs the address, so only it looks it up.
            ip = self.fleet[host]["ip"]
            # NOTE(review): StrictHostKeyChecking=no disables host-key
            # verification; kept as-is, but worth revisiting.
            cmd = [
                "ssh",
                "-o", "StrictHostKeyChecking=no",
                "-o", "ConnectTimeout=5",
                f"root@{ip}",
                command,
            ]
        try:
            return subprocess.run(cmd, capture_output=True, text=True, timeout=15)
        except Exception as e:
            # Best-effort boundary: any failure is reported, never raised.
            self.log(f" [ERROR] Failed to run remote command on {host}: {e}")
            return None

    def _run_fix(self, host: str, command: str, what: str) -> bool:
        """Run a remediation command and log it if it did not succeed."""
        res = self.run_remote(host, command)
        if res is None:
            # run_remote has already logged why the command could not run.
            return False
        if res.returncode != 0:
            detail = (res.stderr or "").strip()
            self.log(
                f" [ERROR] {what} on {host} failed"
                f" (exit {res.returncode}): {detail}"
            )
            return False
        return True

    def confirm(self, prompt: str) -> bool:
        """Ask for confirmation unless --yes flag is set.

        Returns ``True`` for "y"/"yes" and ``False`` for "n"/"no"/empty
        input. When stdin is closed (cron, pipes) the answer is "no"
        instead of crashing with ``EOFError``.
        """
        if self.yes:
            return True
        while True:
            try:
                response = input(f"{prompt} [y/N] ").strip().lower()
            except EOFError:
                self.log(" No interactive input available; assuming 'no'.")
                return False
            if response in ("y", "yes"):
                return True
            if response in ("n", "no", ""):
                return False
            print("Please answer 'y' or 'n'.")

    def check_llama_server(self, host: str):
        """Probe the llama-server health endpoint; offer a restart if down.

        Any HTTP response counts as "up"; only a failed request (connection
        refused, timeout, ...) marks the server as down.
        """
        # Imported here so the rest of the module loads without the
        # third-party dependency installed.
        import requests

        ip = self.fleet[host]["ip"]
        port = self.fleet[host]["port"]
        try:
            requests.get(f"http://{ip}:{port}/health", timeout=2)
            return
        except requests.RequestException:
            # Narrow on purpose: a bare except would treat Ctrl-C during the
            # probe as "server down" and could trigger a restart.
            self.log(f" [!] llama-server down on {host}.")

        if self.dry_run:
            self.log(f" [DRY-RUN] Would restart llama-server on {host}")
        elif self.confirm(f" Restart llama-server on {host}?"):
            self.log(f" Restarting llama-server on {host}...")
            self._run_fix(host, "systemctl restart llama-server", "Restart")
        else:
            self.log(f" Skipped restart on {host}.")

    def check_disk_space(self, host: str):
        """Report root-partition usage above the limit; offer log cleanup."""
        res = self.run_remote(host, "df -h / | tail -1 | awk '{print $5}' | sed 's/%//'")
        if not res or res.returncode != 0:
            return
        raw = res.stdout.strip()
        try:
            usage = int(raw)
        except ValueError:
            self.log(f" [WARN] Could not parse disk usage on {host}: {raw!r}")
            return
        if usage <= self.DISK_LIMIT_PCT:
            return

        self.log(f" [!] Disk usage high on {host} ({usage}%).")
        if self.dry_run:
            self.log(f" [DRY-RUN] Would clean logs and vacuum journal on {host}")
        elif self.confirm(f" Clean logs on {host}?"):
            self.log(f" Cleaning logs on {host}...")
            self._run_fix(
                host,
                "journalctl --vacuum-time=1d && rm -rf /var/log/*.gz",
                "Log cleanup",
            )
        else:
            self.log(f" Skipped log cleaning on {host}.")

    def check_memory(self, host: str):
        """Report memory usage above the limit (informational only)."""
        res = self.run_remote(host, "free -m | awk '/^Mem:/{print $3/$2 * 100}'")
        if not res or res.returncode != 0:
            return
        raw = res.stdout.strip()
        try:
            usage = float(raw)
        except ValueError:
            self.log(f" [WARN] Could not parse memory usage on {host}: {raw!r}")
            return
        if usage <= self.MEMORY_LIMIT_PCT:
            return

        self.log(f" [!] Memory usage high on {host} ({usage:.1f}%).")
        if self.dry_run:
            self.log(f" [DRY-RUN] Would check for memory hogs on {host}")
        else:
            self.log(" Memory high but no automatic action defined.")

    def check_processes(self, host: str):
        """List processes using more than 80% CPU.

        Termination is not implemented; with ``confirm_kill`` set the user
        is prompted, but only a placeholder message is logged.
        """
        res = self.run_remote(host, "ps aux --sort=-%cpu | awk 'NR>1 && $3>80 {print $2, $11, $3}'")
        if not res or res.returncode != 0 or not res.stdout.strip():
            return

        self.log(f" [!] High CPU processes on {host}:")
        for line in res.stdout.strip().split('\n'):
            self.log(f" {line}")

        if self.dry_run:
            self.log(f" [DRY-RUN] Would review high-CPU processes on {host}")
        elif not self.confirm_kill:
            self.log(" Use --confirm-kill to enable process termination (dangerous).")
        elif self.confirm(f" Kill high-CPU processes on {host}? (dangerous)"):
            # This is a placeholder; real implementation would parse PIDs
            self.log(" Process killing not implemented yet (placeholder).")
        else:
            self.log(f" Skipped killing processes on {host}.")

    def check_and_heal(self):
        """Run every check, in order, against every host in the fleet."""
        for host in self.fleet:
            self.log(f"Auditing {host}...")
            self.check_llama_server(host)
            self.check_disk_space(host)
            self.check_memory(host)
            self.check_processes(host)

    def run(self):
        """Announce the mode, audit the fleet, and announce completion."""
        if self.dry_run:
            self.log("Starting self-healing cycle (DRY-RUN mode).")
        else:
            self.log("Starting self-healing cycle (EXECUTE mode).")
        self.check_and_heal()
        self.log("Cycle complete.")


def print_help_safe():
    """Print detailed explanation of what each action does."""
    help_text = """
SAFE-BY-DEFAULT SELF-HEALING SCRIPT

This script checks fleet health and can optionally fix issues.

DEFAULT MODE: DRY-RUN (safe)
  - Only reports what it would do, does not make changes.
  - Use --execute to actually perform fixes.

CHECKS PERFORMED:
  1. llama-server health
     - Checks if llama-server is responding on each host.
     - Action: restart service (requires --execute and confirmation).

  2. Disk space
     - Checks root partition usage on each host.
     - Action: vacuum journal logs and remove rotated logs if >90% (requires --execute and confirmation).

  3. Memory usage
     - Reports high memory usage (informational only, no automatic action).

  4. Process health
     - Lists processes using >80% CPU.
     - Action: none yet; termination is a placeholder (prompt requires --confirm-kill flag, --execute, and confirmation).

SAFETY FEATURES:
  - Dry-run by default.
  - Explicit --execute flag required for changes.
  - Confirmation prompts for all destructive actions.
  - --yes flag to skip confirmations (for automation).
  - --confirm-kill flag required to even consider killing processes.
  - Timestamps on all log messages.

EXAMPLES:
  python3 scripts/self_healing.py
      # Dry-run: safe, shows what would happen.

  python3 scripts/self_healing.py --execute
      # Actually perform fixes after confirmation.

  python3 scripts/self_healing.py --execute --yes
      # Perform fixes without prompts (automation).

  python3 scripts/self_healing.py --execute --confirm-kill
      # Allow killing processes (dangerous).

  python3 scripts/self_healing.py --help-safe
      # Show this help.
"""
    print(help_text)


def main(argv=None):
    """Parse command-line flags and run one self-healing cycle.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Self-healing infrastructure script (safe-by-default).",
        add_help=False  # We'll handle --help ourselves
    )
    parser.add_argument("--dry-run", action="store_true", default=False,
                        help="Run in dry-run mode (default behavior).")
    parser.add_argument("--execute", action="store_true", default=False,
                        help="Actually perform fixes (disables dry-run).")
    parser.add_argument("--confirm-kill", action="store_true", default=False,
                        help="Allow killing processes (dangerous).")
    parser.add_argument("--yes", "-y", action="store_true", default=False,
                        help="Skip confirmation prompts.")
    parser.add_argument("--help-safe", action="store_true", default=False,
                        help="Show detailed help about safety features.")
    parser.add_argument("--help", "-h", action="store_true", default=False,
                        help="Show standard help.")

    args = parser.parse_args(argv)

    if args.help_safe:
        print_help_safe()
        sys.exit(0)

    if args.help:
        parser.print_help()
        sys.exit(0)

    # Dry-run unless --execute is given; an explicit --dry-run always wins,
    # so conflicting flags resolve in the safe direction.
    dry_run = args.dry_run or not args.execute

    healer = SelfHealer(dry_run=dry_run, confirm_kill=args.confirm_kill, yes=args.yes)
    healer.run()


if __name__ == "__main__":
    main()