#!/usr/bin/env python3
"""
[OPS] Self-Healing Infrastructure
Part of the Gemini Sovereign Infrastructure Suite.

Auto-detects and fixes common failures across the fleet.
"""

import os
import sys
import subprocess
import argparse
from typing import Optional

import requests

# --- CONFIGURATION ---
FLEET = {
    "mac": {"ip": "10.1.10.77", "port": 8080},
    "ezra": {"ip": "143.198.27.163", "port": 8080},
    "allegro": {"ip": "167.99.126.228", "port": 8080},
    "bezalel": {"ip": "159.203.146.185", "port": 8080},
}

# Root filesystem usage (percent) above which log cleanup is triggered.
DISK_USAGE_LIMIT_PCT = 90


class SelfHealer:
    """Audit every host in ``FLEET`` and apply simple automated remediations.

    Two checks are performed per host: an HTTP probe of the llama-server
    ``/health`` endpoint (restart via systemctl when unreachable) and a root
    filesystem usage check (log cleanup when above ``DISK_USAGE_LIMIT_PCT``).
    All remediation is best-effort: failures are logged, never raised.
    """

    def log(self, message: str) -> None:
        """Print *message* to stdout with the ``[*]`` prefix."""
        print(f"[*] {message}")

    def run_remote(self, host: str, command: str) -> Optional[subprocess.CompletedProcess]:
        """Run a shell command on *host* and return the completed process.

        The ``"mac"`` entry is executed locally through ``bash -c``; every
        other host is reached over SSH as ``root``.

        Args:
            host: Key into ``FLEET``.
            command: Shell command line to execute on the target.

        Returns:
            The ``subprocess.CompletedProcess`` (stdout/stderr captured as
            text), or ``None`` if the command could not be started or did not
            finish within 10 seconds.

        Raises:
            KeyError: If *host* is not present in ``FLEET``.
        """
        ip = FLEET[host]["ip"]
        if host == "mac":
            argv = ["bash", "-c", command]
        else:
            # NOTE(review): StrictHostKeyChecking=no disables host-key
            # verification, which leaves these root sessions open to MITM.
            # Kept for backward compatibility; consider pinning known_hosts.
            argv = ["ssh", "-o", "StrictHostKeyChecking=no", f"root@{ip}", command]

        try:
            return subprocess.run(argv, capture_output=True, text=True, timeout=10)
        except (subprocess.SubprocessError, OSError, ValueError) as exc:
            # Timeout, missing ssh/bash binary, or an unusable argument.
            # Deliberately not a bare except: Ctrl-C must still stop the run.
            self.log(f" [!] Could not run command on {host}: {exc}")
            return None

    def _check_llama_server(self, host: str) -> None:
        """Probe llama-server on *host* and restart it if it is unreachable."""
        ip = FLEET[host]["ip"]
        port = FLEET[host]["port"]
        try:
            # Any HTTP response counts as "up"; the status code is not
            # inspected, so only connection errors/timeouts trigger a restart.
            requests.get(f"http://{ip}:{port}/health", timeout=2)
        except requests.RequestException:
            self.log(f" [!] llama-server down on {host}. Attempting restart...")
            # NOTE(review): for "mac" this runs locally; systemctl is a
            # systemd tool, so confirm it exists on that machine.
            result = self.run_remote(host, "systemctl restart llama-server")
            if result is not None and result.returncode != 0:
                self.log(
                    f" [!] Restart failed on {host} "
                    f"(exit {result.returncode}): {result.stderr.strip()}"
                )

    def _check_disk(self, host: str) -> None:
        """Check root filesystem usage on *host* and clean logs when it is high."""
        result = self.run_remote(host, "df -h / | tail -1 | awk '{print $5}' | sed 's/%//'")
        if result is None:
            return  # run_remote already logged the reason
        if result.returncode != 0:
            self.log(f" [!] Disk check failed on {host} (exit {result.returncode}).")
            return

        raw = result.stdout.strip()
        try:
            usage = int(raw)
        except ValueError:
            # The pipeline's exit status is that of `sed`, so a failing `df`
            # or ssh banner noise shows up here as non-numeric output.
            self.log(f" [!] Could not parse disk usage on {host}: {raw!r}")
            return

        if usage > DISK_USAGE_LIMIT_PCT:
            self.log(f" [!] Disk usage high on {host} ({usage}%). Cleaning logs...")
            cleanup = self.run_remote(
                host, "journalctl --vacuum-time=1d && rm -rf /var/log/*.gz"
            )
            if cleanup is not None and cleanup.returncode != 0:
                self.log(
                    f" [!] Log cleanup failed on {host} "
                    f"(exit {cleanup.returncode}): {cleanup.stderr.strip()}"
                )

    def check_and_heal(self) -> None:
        """Run every check against every host in ``FLEET``, in order."""
        for host in FLEET:
            self.log(f"Auditing {host}...")
            # 1. Check llama-server
            self._check_llama_server(host)
            # 2. Check disk space
            self._check_disk(host)

    def run(self) -> None:
        """Execute one full self-healing cycle with start/end log lines."""
        self.log("Starting self-healing cycle...")
        self.check_and_heal()
        self.log("Cycle complete.")


def main() -> None:
    """Script entry point: run a single self-healing cycle."""
    healer = SelfHealer()
    healer.run()


if __name__ == "__main__":
    main()