#!/usr/bin/env python3
"""
[OPS] Self-Healing Infrastructure
Part of the Gemini Sovereign Infrastructure Suite.

Auto-detects and fixes common failures across the fleet.
"""

import os
import sys
import subprocess
import argparse
import requests

# --- CONFIGURATION ---
# Hosts to audit. Each entry gives the address used for SSH and the port on
# which the llama-server health endpoint is queried.
FLEET = {
    "mac": {"ip": "10.1.10.77", "port": 8080},
    "ezra": {"ip": "143.198.27.163", "port": 8080},
    "allegro": {"ip": "167.99.126.228", "port": 8080},
    "bezalel": {"ip": "159.203.146.185", "port": 8080},
}


class SelfHealer:
    """Audit every host in FLEET and apply best-effort fixes.

    Two checks are performed per host: an HTTP probe of the llama-server
    ``/health`` endpoint (restarting the service when the probe cannot
    complete) and a root-filesystem usage check (cleaning logs above 90%).
    """

    def log(self, message: str):
        """Print *message* to stdout with the ``[*]`` prefix."""
        print(f"[*] {message}")

    def run_remote(self, host: str, command: str, timeout: float = 10):
        """Run a shell command on *host* and return the completed process.

        The ``"mac"`` host is executed locally through ``bash -c``; every
        other host is reached with ``ssh root@<ip>``.

        Args:
            host: Key into FLEET.
            command: Shell command line to execute.
            timeout: Seconds to wait before giving up on the command.

        Returns:
            The ``subprocess.CompletedProcess`` (stdout/stderr captured as
            text), or ``None`` when the command could not be launched or
            timed out.

        Raises:
            KeyError: If *host* is not a key of FLEET.
        """
        ip = FLEET[host]["ip"]
        if host == "mac":
            cmd = ["bash", "-c", command]
        else:
            # NOTE(review): StrictHostKeyChecking=no turns off host-key
            # verification, which exposes these root sessions to
            # man-in-the-middle attacks. Consider pre-populating known_hosts
            # (or "accept-new") once the fleet's keys are pinned.
            cmd = ["ssh", "-o", "StrictHostKeyChecking=no", f"root@{ip}", command]
        try:
            return subprocess.run(
                cmd, capture_output=True, text=True, timeout=timeout
            )
        except (OSError, subprocess.SubprocessError) as exc:
            # OSError: binary missing / not executable.
            # SubprocessError: includes TimeoutExpired.
            self.log(f" [!] Could not run command on {host}: {exc}")
            return None

    @staticmethod
    def _parse_usage(raw: str):
        """Return the integer percentage in *raw*, or None if unparseable."""
        try:
            return int(raw.strip().rstrip("%"))
        except ValueError:
            return None

    def _report_failure(self, host: str, action: str, res):
        """Log a non-zero exit of *action*; launch failures are logged by run_remote."""
        if res is not None and res.returncode != 0:
            detail = (res.stderr or "").strip()
            self.log(
                f" [!] {action} failed on {host} "
                f"(exit {res.returncode}): {detail}"
            )

    def _check_llama_server(self, host: str):
        """Probe the llama-server health endpoint and restart it if unreachable."""
        ip = FLEET[host]["ip"]
        port = FLEET[host]["port"]
        try:
            # Only a failure to complete the request counts as "down"; the
            # HTTP status code is deliberately not inspected, so a server
            # that answers at all is left alone.
            requests.get(f"http://{ip}:{port}/health", timeout=2)
        except requests.RequestException:
            self.log(f" [!] llama-server down on {host}. Attempting restart...")
            res = self.run_remote(host, "systemctl restart llama-server")
            self._report_failure(host, "llama-server restart", res)

    def _check_disk(self, host: str):
        """Read root-filesystem usage and clean logs when it exceeds 90%."""
        res = self.run_remote(
            host, "df -h / | tail -1 | awk '{print $5}' | sed 's/%//'"
        )
        if res is None or res.returncode != 0:
            self._report_failure(host, "Disk usage check", res)
            return

        usage = self._parse_usage(res.stdout)
        if usage is None:
            self.log(
                f" [!] Could not parse disk usage on {host}: {res.stdout!r}"
            )
            return

        if usage > 90:
            self.log(f" [!] Disk usage high on {host} ({usage}%). Cleaning logs...")
            cleanup = self.run_remote(
                host, "journalctl --vacuum-time=1d && rm -rf /var/log/*.gz"
            )
            self._report_failure(host, "Log cleanup", cleanup)

    def check_and_heal(self):
        """Run every check against every host in FLEET.

        A failure while auditing one host is logged and does not stop the
        remaining hosts from being audited.
        """
        for host in FLEET:
            self.log(f"Auditing {host}...")
            try:
                # 1. Check llama-server
                self._check_llama_server(host)
                # 2. Check disk space
                self._check_disk(host)
            except Exception as exc:
                # Per-host boundary: keep the cycle best-effort, but let
                # KeyboardInterrupt/SystemExit propagate.
                self.log(f" [!] Unexpected error while auditing {host}: {exc!r}")

    def run(self):
        """Execute one full self-healing cycle, logging its start and end."""
        self.log("Starting self-healing cycle...")
        self.check_and_heal()
        self.log("Cycle complete.")


def main():
    """Script entry point: run a single self-healing cycle."""
    healer = SelfHealer()
    healer.run()


if __name__ == "__main__":
    main()