# timmy-config/scripts/fleet_llama.py

#!/usr/bin/env python3
"""
[OPS] llama.cpp Fleet Manager
Part of the Gemini Sovereign Infrastructure Suite.

Manages llama-server instances across the Timmy Foundation fleet.
Supports status, restart, and model swapping via SSH.
"""

import argparse
import json
import os
import re
import subprocess
import sys
from typing import Dict, List, Any

import requests

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
if SCRIPT_DIR not in sys.path:
    sys.path.insert(0, SCRIPT_DIR)

from ssh_trust import VerifiedSSHExecutor

# --- FLEET DEFINITION ---
# Maps each host's short name (the values offered as CLI `host` choices) to
# its connection details:
#   ip   - address used for the ping check, the SSH target, and the health URL
#   port - port of the llama-server HTTP endpoint queried at /health
#   role - descriptive label; not read by any code in this file
# The "mac" entry is special-cased by FleetManager.run_remote, which runs its
# commands locally through bash instead of over SSH.
FLEET = {
    "mac": {"ip": "10.1.10.77", "port": 8080, "role": "hub"},
    "ezra": {"ip": "143.198.27.163", "port": 8080, "role": "forge"},
    "allegro": {"ip": "167.99.126.228", "port": 8080, "role": "agent-host"},
    "bezalel": {"ip": "159.203.146.185", "port": 8080, "role": "world-host"}
}

class FleetManager:
    """Inspect and control llama-server instances on the hosts listed in FLEET.

    Status checks use ping plus an HTTP request to the server's /health
    endpoint; restart and model-swap operations run systemctl/sed commands on
    the host through run_remote().
    """

    # Model names are interpolated into a single-quoted remote sed expression.
    # Restricting them to this character set keeps quotes, slashes, '&' and
    # shell metacharacters out of that command line.
    _MODEL_NAME_RE = re.compile(r"[A-Za-z0-9._+-]+")

    def __init__(self, executor=None):
        """Create a manager.

        Args:
            executor: Optional executor object; a VerifiedSSHExecutor is
                created when none (or a falsy value) is given.
        """
        self.results = {}
        # NOTE(review): the executor is stored but no method in this class
        # calls it; run_remote() still shells out to ssh directly. Presumably
        # the intent is to route remote commands through it - confirm against
        # the ssh_trust API before wiring it in.
        self.executor = executor or VerifiedSSHExecutor()

    def run_remote(self, host: str, command: str):
        """Run a shell command on a fleet host.

        Args:
            host: Key into FLEET. "mac" is executed locally through
                ``bash -c``; every other host is reached over SSH as root.
            command: Shell command line to execute.

        Returns:
            The subprocess.CompletedProcess (inspect ``returncode``), or None
            when the command timed out or could not be launched.

        Raises:
            KeyError: If ``host`` is not a key of FLEET.
        """
        ip = FLEET[host]["ip"]
        if host == "mac":
            # For Mac, we might need a different user or local execution
            argv = ["bash", "-c", command]
        else:
            # NOTE(review): StrictHostKeyChecking=no disables host-key
            # verification for these connections; left unchanged here because
            # tightening it may break existing fleet access.
            argv = [
                "ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
                f"root@{ip}", command
            ]

        try:
            return subprocess.run(argv, capture_output=True, text=True, timeout=10)
        except subprocess.TimeoutExpired:
            print(f"Timed out running remote command on {host}")
            return None
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            print(f"Error running remote command on {host}: {e}")
            return None

    def get_status(self, host: str) -> Dict[str, Any]:
        """Probe a host and report whether it and its llama-server respond.

        Args:
            host: Key into FLEET.

        Returns:
            A dict with keys ``online`` (ping succeeded), ``server_running``
            (GET /health returned 200), ``model`` (the ``model`` entry of the
            health payload as a string, or "unknown") and ``tps`` (always 0.0
            here; nothing in this method measures it).

        Raises:
            KeyError: If ``host`` is not a key of FLEET.
        """
        ip = FLEET[host]["ip"]
        port = FLEET[host]["port"]

        status = {"online": False, "server_running": False, "model": "unknown", "tps": 0.0}

        # 1. Check if machine is reachable. A missing ping binary or a hung
        #    ping is reported as "offline" rather than crashing the listing.
        try:
            ping_res = subprocess.run(
                ["ping", "-c", "1", "-W", "1", ip], capture_output=True, timeout=5
            )
        except (OSError, subprocess.TimeoutExpired):
            return status
        if ping_res.returncode != 0:
            return status
        status["online"] = True

        # 2. Check if llama-server is responding to health check
        try:
            response = requests.get(f"http://{ip}:{port}/health", timeout=2)
        except requests.RequestException:
            return status
        if response.status_code != 200:
            return status
        status["server_running"] = True

        # 3. Best-effort model lookup: the health payload may not be JSON or
        #    may not carry a "model" entry, in which case "unknown" is kept.
        try:
            data = response.json()
        except ValueError:
            return status
        if isinstance(data, dict):
            # Coerce to str so the table formatter's width spec cannot fail
            # on null or numeric values.
            status["model"] = str(data.get("model") or "unknown")

        return status

    def show_fleet_status(self):
        """Print a one-line status row for every host in FLEET."""
        print(f"{'NAME':<10} {'IP':<15} {'STATUS':<10} {'SERVER':<10} {'MODEL':<20}")
        print("-" * 70)
        for name, info in FLEET.items():
            status = self.get_status(name)
            online_str = "✅" if status["online"] else "❌"
            server_str = "🚀" if status["server_running"] else "💤"
            print(f"{name:<10} {info['ip']:<15} {online_str:<10} {server_str:<10} {status['model']:<20}")

    def restart_server(self, host: str):
        """Restart the llama-server systemd unit on a host.

        Args:
            host: Key into FLEET.

        Returns:
            True if the restart command exited with status 0, else False.
        """
        print(f"[*] Restarting llama-server on {host}...")
        res = self.run_remote(host, "systemctl restart llama-server")
        if res is not None and res.returncode == 0:
            print(f"[SUCCESS] Restarted {host}")
            return True
        print(f"[FAILURE] Could not restart {host}")
        return False

    def swap_model(self, host: str, model_name: str):
        """Point a host's llama-server unit at a different model and restart it.

        Stops the service, rewrites the ``-m <path>.gguf`` argument in
        /etc/systemd/system/llama-server.service to
        ``/opt/models/<model_name>.gguf`` with sed, then reloads systemd and
        starts the service again.

        Args:
            host: Key into FLEET.
            model_name: Model file name without the ``.gguf`` suffix. Only
                letters, digits, ``.``, ``_``, ``+`` and ``-`` are accepted.

        Returns:
            True if every step exited with status 0, else False. Note that
            sed exits 0 even when the pattern matched nothing, so True does
            not prove the unit file was actually changed.
        """
        print(f"[*] Swapping model on {host} to {model_name}...")

        # Reject anything that could break out of the quoted sed expression
        # before any remote command is issued.
        if not isinstance(model_name, str) or not self._MODEL_NAME_RE.fullmatch(model_name):
            print(
                f"[FAILURE] Invalid model name {model_name!r}: "
                "only letters, digits, '.', '_', '+' and '-' are allowed"
            )
            return False

        # This is a bit risky to do via one-liner, but for the manager:
        sed_cmd = (
            f"sed -i 's/-m .*\\.gguf/-m \\/opt\\/models\\/{model_name}.gguf/' "
            "/etc/systemd/system/llama-server.service"
        )
        steps = (
            ("stop", "systemctl stop llama-server"),
            ("update service file", sed_cmd),
            ("start", "systemctl daemon-reload && systemctl start llama-server"),
        )

        # Every step is always attempted so a failed edit never leaves the
        # service stopped; failures are collected and reported at the end.
        failed = []
        for label, command in steps:
            res = self.run_remote(host, command)
            if res is None or res.returncode != 0:
                failed.append(label)

        if failed:
            print(f"[FAILURE] Could not swap model on {host} (failed: {', '.join(failed)})")
            return False
        print(f"[SUCCESS] Swapped model on {host}")
        return True

def main():
    """Parse the command line and dispatch to the matching FleetManager action.

    Sub-commands: ``status`` (print the fleet table), ``restart <host>`` and
    ``swap <host> <model>``. With no sub-command the help text is printed.
    """
    cli = argparse.ArgumentParser(description="Gemini Fleet Manager")
    commands = cli.add_subparsers(dest="command")
    host_choices = list(FLEET)

    commands.add_parser("status", help="Show fleet status")

    restart_cmd = commands.add_parser("restart", help="Restart a server")
    restart_cmd.add_argument("host", choices=host_choices, help="Host to restart")

    swap_cmd = commands.add_parser("swap", help="Swap model on a host")
    swap_cmd.add_argument("host", choices=host_choices, help="Host to swap")
    swap_cmd.add_argument("model", help="Model name (GGUF)")

    opts = cli.parse_args()

    # The manager is built before dispatch, even when only help is shown.
    fleet = FleetManager()

    if opts.command == "status":
        fleet.show_fleet_status()
        return
    if opts.command == "restart":
        fleet.restart_server(opts.host)
        return
    if opts.command == "swap":
        fleet.swap_model(opts.host, opts.model)
        return

    # No sub-command given.
    cli.print_help()

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()