timmy-config/scripts/self_healing.py

#!/usr/bin/env python3
"""
[OPS] Self-Healing Infrastructure
Part of the Gemini Sovereign Infrastructure Suite.

Auto-detects and fixes common failures across the fleet.

Safe-by-default: runs in dry-run mode unless --execute is given.
"""

import os
import sys
import subprocess
import argparse
import requests
import datetime

# --- CONFIGURATION ---
# Hosts audited on every cycle, keyed by short host name.
#   "ip"   - address used as the SSH target and for the HTTP health probe.
#   "port" - port on which the llama-server /health endpoint is queried.
# The "mac" entry is special-cased in SelfHealer.run_remote: its commands are
# executed locally through bash instead of over SSH.
FLEET = {
    "mac": {"ip": "10.1.10.77", "port": 8080},
    "ezra": {"ip": "143.198.27.163", "port": 8080},
    "allegro": {"ip": "167.99.126.228", "port": 8080},
    "bezalel": {"ip": "159.203.146.185", "port": 8080}
}

class SelfHealer:
    """Audit every host in ``FLEET`` and optionally repair common failures.

    Each ``check_*`` method inspects one aspect of a host (llama-server
    health, disk usage, memory usage, CPU-heavy processes) and logs what it
    finds. Repairs are attempted only when ``dry_run`` is false, and every
    repair is gated behind :meth:`confirm`.

    Attributes:
        dry_run: When true (the default), only report what would be done.
        confirm_kill: Opt-in for the process-kill prompt in
            :meth:`check_processes`.
        yes: When true, :meth:`confirm` answers "yes" without prompting.
    """

    def __init__(self, dry_run=True, confirm_kill=False, yes=False):
        self.dry_run = dry_run
        self.confirm_kill = confirm_kill
        self.yes = yes

    def log(self, message: str):
        """Print ``message`` prefixed with a local ``YYYY-MM-DD HH:MM:SS`` timestamp."""
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"[{timestamp}] {message}")

    def run_remote(self, host: str, command: str):
        """Run a shell command on ``host`` and return the completed process.

        The "mac" host is executed locally through ``bash -c``; every other
        host is reached over SSH as root at its ``FLEET`` IP address.

        Args:
            host: Key into ``FLEET``.
            command: Shell command line to execute.

        Returns:
            The ``subprocess.CompletedProcess`` with stdout/stderr captured as
            text, or ``None`` if the command could not be started or did not
            finish within 15 seconds (the error is logged).

        Raises:
            KeyError: If ``host`` is not a key of ``FLEET``.
        """
        ip = FLEET[host]["ip"]
        if host == "mac":
            cmd = ["bash", "-c", command]
        else:
            # NOTE(review): StrictHostKeyChecking=no accepts unknown host keys,
            # which leaves these root sessions open to man-in-the-middle
            # attacks. Consider pinning the fleet's keys in known_hosts.
            cmd = [
                "ssh",
                "-o", "StrictHostKeyChecking=no",
                "-o", "ConnectTimeout=5",
                f"root@{ip}",
                command,
            ]
        try:
            return subprocess.run(cmd, capture_output=True, text=True, timeout=15)
        except Exception as e:
            # Best-effort boundary: one unreachable host must not abort the
            # whole audit, so the failure is logged and reported as None.
            self.log(f"  [ERROR] Failed to run remote command on {host}: {e}")
            return None

    def confirm(self, prompt: str) -> bool:
        """Ask for confirmation unless --yes flag is set.

        Returns:
            True for "y"/"yes"; False for "n"/"no", an empty answer, or when
            no input can be read (end of file on stdin).
        """
        if self.yes:
            return True
        while True:
            try:
                response = input(f"{prompt} [y/N] ").strip().lower()
            except EOFError:
                # No interactive stdin (e.g. closed pipe): fall back to the
                # prompt's safe default instead of crashing mid-cycle.
                self.log("  [WARN] No input available for confirmation; assuming 'no'.")
                return False
            if response in ("y", "yes"):
                return True
            if response in ("n", "no", ""):
                return False
            print("Please answer 'y' or 'n'.")

    @staticmethod
    def _parse_usage(raw, cast=float):
        """Convert command output to a number with ``cast``; None if it is not numeric."""
        text = (raw or "").strip()
        try:
            return cast(text)
        except ValueError:
            return None

    def _query(self, host: str, command: str, what: str):
        """Run a read-only probe on ``host``.

        Returns the probe's stdout, or ``None`` (after logging why) when the
        command could not be run or exited with a non-zero status.
        """
        res = self.run_remote(host, command)
        if res is None:
            return None  # run_remote has already logged the error
        if res.returncode != 0:
            detail = (res.stderr or "").strip()
            self.log(f"  [WARN] Could not read {what} on {host} (exit {res.returncode}): {detail}")
            return None
        return res.stdout

    def _report_failure(self, action: str, host: str, res):
        """Log an error when a repair command ran but exited non-zero."""
        if res is not None and res.returncode != 0:
            detail = (res.stderr or "").strip()
            self.log(f"  [ERROR] {action} failed on {host} (exit {res.returncode}): {detail}")

    def check_llama_server(self, host: str):
        """Probe the host's ``/health`` endpoint and offer a restart if it is unreachable."""
        ip = FLEET[host]["ip"]
        port = FLEET[host]["port"]
        try:
            response = requests.get(f"http://{ip}:{port}/health", timeout=2)
        except requests.RequestException:
            # Only network-level failures count as "down"; unrelated
            # exceptions (including Ctrl-C) are allowed to propagate.
            self.log(f"  [!] llama-server down on {host}.")
        else:
            if not response.ok:
                # The process answered, so it is not "down". A non-2xx status
                # (possibly a model still loading - TODO confirm against the
                # llama-server docs) is reported but does not trigger a restart.
                self.log(
                    f"  [WARN] llama-server on {host} answered /health "
                    f"with HTTP {response.status_code}."
                )
            return

        if self.dry_run:
            self.log(f"  [DRY-RUN] Would restart llama-server on {host}")
        elif self.confirm(f"  Restart llama-server on {host}?"):
            self.log(f"  Restarting llama-server on {host}...")
            res = self.run_remote(host, "systemctl restart llama-server")
            self._report_failure("Restart of llama-server", host, res)
        else:
            self.log(f"  Skipped restart on {host}.")

    def check_disk_space(self, host: str):
        """Report root-partition usage above 90% and offer to clean logs."""
        out = self._query(
            host,
            "df -h / | tail -1 | awk '{print $5}' | sed 's/%//'",
            "disk usage",
        )
        if out is None:
            return
        usage = self._parse_usage(out, int)
        if usage is None:
            self.log(f"  [WARN] Could not parse disk usage on {host}: {out.strip()!r}")
            return
        if usage <= 90:
            return

        self.log(f"  [!] Disk usage high on {host} ({usage}%).")
        if self.dry_run:
            self.log(f"  [DRY-RUN] Would clean logs and vacuum journal on {host}")
        elif self.confirm(f"  Clean logs on {host}?"):
            self.log(f"  Cleaning logs on {host}...")
            res = self.run_remote(host, "journalctl --vacuum-time=1d && rm -rf /var/log/*.gz")
            self._report_failure("Log cleanup", host, res)
        else:
            self.log(f"  Skipped log cleaning on {host}.")

    def check_memory(self, host: str):
        """Report memory usage above 90%; no automatic repair is defined."""
        out = self._query(host, "free -m | awk '/^Mem:/{print $3/$2 * 100}'", "memory usage")
        if out is None:
            return
        usage = self._parse_usage(out, float)
        if usage is None:
            self.log(f"  [WARN] Could not parse memory usage on {host}: {out.strip()!r}")
            return
        if usage <= 90:
            return

        self.log(f"  [!] Memory usage high on {host} ({usage:.1f}%).")
        if self.dry_run:
            self.log(f"  [DRY-RUN] Would check for memory hogs on {host}")
        else:
            self.log("  Memory high but no automatic action defined.")

    def check_processes(self, host: str):
        """List processes using more than 80% CPU; killing them is still a placeholder."""
        # Each output line is "<pid> <command> <cpu%>" for one offending process.
        out = self._query(
            host,
            "ps aux --sort=-%cpu | awk 'NR>1 && $3>80 {print $2, $11, $3}'",
            "process list",
        )
        listing = out.strip() if out else ""
        if not listing:
            return

        self.log(f"  [!] High CPU processes on {host}:")
        for line in listing.split('\n'):
            self.log(f"    {line}")

        if self.dry_run:
            self.log(f"  [DRY-RUN] Would review high-CPU processes on {host}")
        elif not self.confirm_kill:
            self.log("  Use --confirm-kill to enable process termination (dangerous).")
        elif self.confirm(f"  Kill high-CPU processes on {host}? (dangerous)"):
            # This is a placeholder; real implementation would parse PIDs
            self.log("  Process killing not implemented yet (placeholder).")
        else:
            self.log(f"  Skipped killing processes on {host}.")

    def check_and_heal(self):
        """Run every check, in a fixed order, against each host in ``FLEET``."""
        for host in FLEET:
            self.log(f"Auditing {host}...")
            self.check_llama_server(host)
            self.check_disk_space(host)
            self.check_memory(host)
            self.check_processes(host)

    def run(self):
        """Log the active mode, run one full audit cycle, and log completion."""
        if self.dry_run:
            self.log("Starting self-healing cycle (DRY-RUN mode).")
        else:
            self.log("Starting self-healing cycle (EXECUTE mode).")
        self.check_and_heal()
        self.log("Cycle complete.")

def print_help_safe():
    """Print the detailed safety help shown for ``--help-safe``.

    The text describes each check, the flags that gate its action, and usage
    examples. It states that process killing is not implemented, matching the
    placeholder behaviour of ``SelfHealer.check_processes``.
    """
    help_text = """
SAFE-BY-DEFAULT SELF-HEALING SCRIPT

This script checks fleet health and can optionally fix issues.

DEFAULT MODE: DRY-RUN (safe)
  - Only reports what it would do, does not make changes.
  - Use --execute to actually perform fixes.

CHECKS PERFORMED:
  1. llama-server health
     - Checks if llama-server is responding on each host.
     - Action: restart service (requires --execute and confirmation).

  2. Disk space
     - Checks root partition usage on each host.
     - Action: vacuum journal logs and remove rotated logs if >90% (requires --execute and confirmation).

  3. Memory usage
     - Reports high memory usage (informational only, no automatic action).

  4. Process health
     - Lists processes using >80% CPU.
     - Action: none yet. Process killing is not implemented; with --confirm-kill,
       --execute, and confirmation the script only logs a placeholder message.

SAFETY FEATURES:
  - Dry-run by default.
  - Explicit --execute flag required for changes.
  - Confirmation prompts for all destructive actions.
  - --yes flag to skip confirmations (for automation).
  - --confirm-kill flag required to even consider killing processes.
  - Timestamps on all log messages.

EXAMPLES:
  python3 scripts/self_healing.py
    # Dry-run: safe, shows what would happen.

  python3 scripts/self_healing.py --execute
    # Actually perform fixes after confirmation.

  python3 scripts/self_healing.py --execute --yes
    # Perform fixes without prompts (automation).

  python3 scripts/self_healing.py --execute --confirm-kill
    # Opt in to the process-kill prompt (killing itself is not implemented yet).

  python3 scripts/self_healing.py --help-safe
    # Show this help.
"""
    print(help_text)

def main(argv=None):
    """Parse command-line flags and run one self-healing cycle.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour, so
            existing ``main()`` callers are unaffected.

    Exits with status 0 after printing help for ``--help-safe`` or ``--help``.
    Otherwise it builds a ``SelfHealer`` from the flags and runs it; dry-run
    is used unless ``--execute`` is given, and ``--dry-run`` always wins when
    both flags are present.
    """
    parser = argparse.ArgumentParser(
        description="Self-healing infrastructure script (safe-by-default).",
        add_help=False  # We'll handle --help ourselves
    )
    parser.add_argument("--dry-run", action="store_true", default=False,
                        help="Run in dry-run mode (default behavior).")
    parser.add_argument("--execute", action="store_true", default=False,
                        help="Actually perform fixes (disables dry-run).")
    parser.add_argument("--confirm-kill", action="store_true", default=False,
                        help="Allow killing processes (dangerous).")
    parser.add_argument("--yes", "-y", action="store_true", default=False,
                        help="Skip confirmation prompts.")
    parser.add_argument("--help-safe", action="store_true", default=False,
                        help="Show detailed help about safety features.")
    parser.add_argument("--help", "-h", action="store_true", default=False,
                        help="Show standard help.")

    args = parser.parse_args(argv)

    # --help-safe takes precedence over --help when both are given.
    if args.help_safe:
        print_help_safe()
        sys.exit(0)

    if args.help:
        parser.print_help()
        sys.exit(0)

    # Safe by default: changes happen only with --execute, and an explicit
    # --dry-run overrides --execute if both are passed.
    dry_run = args.dry_run or not args.execute

    healer = SelfHealer(dry_run=dry_run, confirm_kill=args.confirm_kill, yes=args.yes)
    healer.run()

# Run a healing cycle only when executed as a script, not when imported.
if __name__ == "__main__":
    main()