feat: implement self_healing.py
This commit is contained in:
71
scripts/self_healing.py
Normal file
71
scripts/self_healing.py
Normal file
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
[OPS] Self-Healing Infrastructure
|
||||
Part of the Gemini Sovereign Infrastructure Suite.
|
||||
|
||||
Auto-detects and fixes common failures across the fleet.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
|
||||
# --- CONFIGURATION ---
|
||||
FLEET = {
|
||||
"mac": {"ip": "10.1.10.77", "port": 8080},
|
||||
"ezra": {"ip": "143.198.27.163", "port": 8080},
|
||||
"allegro": {"ip": "167.99.126.228", "port": 8080},
|
||||
"bezalel": {"ip": "159.203.146.185", "port": 8080}
|
||||
}
|
||||
|
||||
class SelfHealer:
|
||||
def log(self, message: str):
|
||||
print(f"[*] {message}")
|
||||
|
||||
def run_remote(self, host: str, command: str):
|
||||
ip = FLEET[host]["ip"]
|
||||
ssh_cmd = ["ssh", "-o", "StrictHostKeyChecking=no", f"root@{ip}", command]
|
||||
if host == "mac":
|
||||
ssh_cmd = ["bash", "-c", command]
|
||||
try:
|
||||
return subprocess.run(ssh_cmd, capture_output=True, text=True, timeout=10)
|
||||
except:
|
||||
return None
|
||||
|
||||
def check_and_heal(self):
|
||||
for host in FLEET:
|
||||
self.log(f"Auditing {host}...")
|
||||
|
||||
# 1. Check llama-server
|
||||
ip = FLEET[host]["ip"]
|
||||
port = FLEET[host]["port"]
|
||||
try:
|
||||
requests.get(f"http://{ip}:{port}/health", timeout=2)
|
||||
except:
|
||||
self.log(f" [!] llama-server down on {host}. Attempting restart...")
|
||||
self.run_remote(host, "systemctl restart llama-server")
|
||||
|
||||
# 2. Check disk space
|
||||
res = self.run_remote(host, "df -h / | tail -1 | awk '{print $5}' | sed 's/%//'")
|
||||
if res and res.returncode == 0:
|
||||
try:
|
||||
usage = int(res.stdout.strip())
|
||||
if usage > 90:
|
||||
self.log(f" [!] Disk usage high on {host} ({usage}%). Cleaning logs...")
|
||||
self.run_remote(host, "journalctl --vacuum-time=1d && rm -rf /var/log/*.gz")
|
||||
except:
|
||||
pass
|
||||
|
||||
def run(self):
|
||||
self.log("Starting self-healing cycle...")
|
||||
self.check_and_heal()
|
||||
self.log("Cycle complete.")
|
||||
|
||||
def main():
|
||||
healer = SelfHealer()
|
||||
healer.run()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user