feat: llama-server health monitor with auto-restart (#713)

llama-server on port 8081 was down with no monitoring or auto-restart.

New scripts/llama_health_monitor.py:
- check_server_health(): HTTP health check for any inference server
- check_ollama() / check_llama_server(): typed health checks
- find_llama_server_process(): find running process via ps
- restart_llama_server(): kill existing + start new with params
- _find_hermes3_model(): auto-discover model file path
- check_all_servers(): check Ollama + llama-server together
- format_status(): human-readable health report
- --watch mode: continuous monitoring with down alerts

CLI:
  python3 llama_health_monitor.py --check
  python3 llama_health_monitor.py --check --port 8081
  python3 llama_health_monitor.py --restart 8081
  python3 llama_health_monitor.py --watch
  python3 llama_health_monitor.py --report

Closes #713
2026-04-15 18:59:39 -04:00
1 changed files with 316 additions and 0 deletions
--- a/scripts/llama_health_monitor.py
+++ b/scripts/llama_health_monitor.py
@@ -0,0 +1,316 @@
 #!/usr/bin/env python3
 """llama-server health monitor — check, restart, and report on local inference.
 Monitors local inference servers (llama-server, Ollama) and can auto-restart
 them when they go down.
 Usage:
    python3 scripts/llama_health_monitor.py --check              # check all
    python3 scripts/llama_health_monitor.py --check --port 8081  # check specific
    python3 scripts/llama_health_monitor.py --restart 8081       # restart server
    python3 scripts/llama_health_monitor.py --watch              # continuous monitor
    python3 scripts/llama_health_monitor.py --report             # JSON status report
 """
 from __future__ import annotations
 import json
 import logging
 import os
 import subprocess
 import sys
 import time
 from dataclasses import dataclass, asdict
 from typing import Any, Dict, List, Optional
 logger = logging.getLogger(__name__)
 # Default servers to monitor
 _DEFAULT_SERVERS = [
    {"name": "ollama", "port": 11434, "type": "ollama", "health_path": "/api/tags"},
    {"name": "llama-server", "port": 8081, "type": "llama-server", "health_path": "/health"},
 ]
@dataclass
 class ServerStatus:
    """Status of a single inference server.

    Instances are built by ``check_ollama`` and ``check_llama_server`` from the
    dict returned by ``check_server_health``, and are serialised with
    ``dataclasses.asdict`` for the JSON report.
    """
    # Display name of the server ("ollama" or "llama-server" in this file).
    name: str
    # TCP port that was probed.
    port: int
    # Kind of server; the checks in this file set it to the same value as name.
    server_type: str
    # True when the HTTP request produced a response.
    reachable: bool
    # True when the response status was 200.
    health_ok: bool
    # Elapsed time of the probe in whole milliseconds.
    latency_ms: int
    # Model names extracted from the JSON response body; empty when none found.
    models: List[str]
    # Failure description; empty string when the probe succeeded.
    error: str
    # Local, timezone-naive ISO-8601 timestamp of when the status was created.
    checked_at: str
 def check_server_health(host: str = "localhost", port: int = 8081, health_path: str = "/health", timeout: int = 5) -> dict:
    """Probe an inference server's HTTP health endpoint.

    Issues ``GET http://{host}:{port}{health_path}`` and classifies the result.
    An HTTP error response (for example 503 or 404) still proves that a server
    is listening, so it is reported as reachable but unhealthy instead of being
    lumped together with connection failures.

    Args:
        host: Host name or address to connect to.
        port: TCP port of the server.
        health_path: URL path of the health endpoint, with leading slash.
        timeout: Socket timeout in seconds for the request.

    Returns:
        A dict with these keys:
            reachable (bool): the server answered with any HTTP response.
            health_ok (bool): the response status was 200.
            latency_ms (int): time until the response or the failure.
            models (list): model names found in a JSON response body.
            error (str): failure description, empty on success.

    Failures are reported through ``error``; ordinary exceptions are not
    propagated to the caller.
    """
    import urllib.error
    import urllib.request

    result = {
        "reachable": False,
        "health_ok": False,
        "latency_ms": 0,
        "models": [],
        "error": "",
    }
    url = f"http://{host}:{port}{health_path}"
    t0 = time.monotonic()
    try:
        req = urllib.request.Request(url, method="GET")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result["latency_ms"] = int((time.monotonic() - t0) * 1000)
            result["reachable"] = True
            if resp.status == 200:
                result["health_ok"] = True
                # The model list is a best-effort extra: a missing, non-JSON or
                # oddly shaped body must not turn a healthy probe into a failure.
                try:
                    data = json.loads(resp.read())
                except Exception:
                    data = None
                if isinstance(data, dict):
                    entries = data.get("data", data.get("models", []))
                    if isinstance(entries, list):
                        result["models"] = [
                            m.get("name", m.get("id", ""))
                            for m in entries
                            if isinstance(m, dict)
                        ]
            else:
                result["error"] = f"Unexpected HTTP status {resp.status}"
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first. The
        # server did answer; it is up but not healthy.
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["reachable"] = True
        result["error"] = f"HTTP {e.code}: {e.reason}"
        e.close()
    except urllib.error.URLError as e:
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["error"] = f"Connection refused or unreachable: {e}"
    except Exception as e:
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["error"] = str(e)
    return result
 def check_ollama(port: int = 11434) -> ServerStatus:
    """Probe the Ollama API on *port* and wrap the outcome in a ServerStatus.

    Health is judged by requesting ``/api/tags``. The returned status is named
    "ollama" and stamped with the local time at which it was assembled.
    """
    import datetime
    probe = check_server_health(port=port, health_path="/api/tags")
    copied_keys = ("reachable", "health_ok", "latency_ms", "models", "error")
    outcome = {key: probe[key] for key in copied_keys}
    stamp = datetime.datetime.now().isoformat()
    return ServerStatus(
        name="ollama",
        port=port,
        server_type="ollama",
        checked_at=stamp,
        **outcome,
    )
 def check_llama_server(port: int = 8081) -> ServerStatus:
    """Probe a llama-server instance on *port* through its ``/health`` endpoint.

    The returned status is named "llama-server" and stamped with the local time
    at which it was assembled. A probe result without a ``models`` entry yields
    an empty model list.
    """
    import datetime
    probe = check_server_health(port=port, health_path="/health")
    is_reachable = probe["reachable"]
    is_healthy = probe["health_ok"]
    elapsed_ms = probe["latency_ms"]
    model_names = probe.get("models", [])
    failure = probe["error"]
    stamp = datetime.datetime.now().isoformat()
    return ServerStatus(
        name="llama-server",
        port=port,
        server_type="llama-server",
        reachable=is_reachable,
        health_ok=is_healthy,
        latency_ms=elapsed_ms,
        models=model_names,
        error=failure,
        checked_at=stamp,
    )
 def find_llama_server_process() -> Optional[dict]:
    """Find a running llama-server process by scanning ``ps aux``.

    A process matches only when the file name of its executable (the first
    word of the COMMAND column) starts with ``llama-server``. Matching on the
    executable, rather than on the whole line, keeps unrelated processes that
    merely mention the name in an argument (``tail -f llama-server.log``, an
    editor, ``grep``) from being reported, which matters because the result is
    used to decide which PID to terminate.

    Returns:
        A dict with ``pid`` (int), ``cpu`` and ``mem`` (strings as printed by
        ps) and ``command`` (full command line) for the first match, or None
        when nothing matches or ``ps`` cannot be run.
    """
    try:
        result = subprocess.run(
            ["ps", "aux"],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        # ps missing, not executable, or timed out: treat as "nothing found".
        return None
    for line in result.stdout.splitlines():
        parts = line.split()
        # ps aux prints ten columns before COMMAND.
        if len(parts) < 11:
            continue
        if not os.path.basename(parts[10]).startswith("llama-server"):
            continue
        try:
            pid = int(parts[1])
        except ValueError:
            continue
        return {
            "pid": pid,
            "cpu": parts[2],
            "mem": parts[3],
            "command": " ".join(parts[10:]),
        }
    return None
 def _wait_for_pid_exit(pid: int, timeout: float = 10.0) -> bool:
    """Poll until process *pid* is gone; return False if it outlives *timeout*."""
    deadline = time.monotonic() + timeout
    while True:
        try:
            os.kill(pid, 0)  # signal 0 only tests whether the process exists
        except ProcessLookupError:
            return True
        except OSError:
            # Cannot inspect the process any more; stop waiting.
            return False
        if time.monotonic() >= deadline:
            return False
        time.sleep(0.2)


 def restart_llama_server(
    model_path: str = "",
    port: int = 8081,
    ctx_size: int = 8192,
    gpu_layers: int = 99,
    alias: str = "hermes3",
    startup_timeout: float = 30.0,
 ) -> dict:
    """Stop any running llama-server and start a new one.

    Everything needed to launch the replacement (model file, binary on PATH)
    is resolved *before* the running server is stopped, so a failed lookup
    never leaves the machine with no server at all.

    NOTE: the process to stop is whatever ``find_llama_server_process``
    returns; it is not matched against *port*.

    Args:
        model_path: Model file to load; discovered automatically when empty.
        port: Port the new server listens on (bound to 127.0.0.1).
        ctx_size: Value passed to ``--ctx-size``.
        gpu_layers: Value passed to ``--n-gpu-layers``.
        alias: Value passed to ``--alias``.
        startup_timeout: Maximum number of seconds to wait for the new server
            to answer on *port* before reporting failure.

    Returns:
        ``{"success": True, "pid", "port", "model"}`` when the new server
        answers, otherwise ``{"success": False, "error", ...}`` (with ``pid``
        when a process was started).
    """
    import shutil
    import signal

    # Find model path if not specified
    if not model_path:
        model_path = _find_hermes3_model()
        if not model_path:
            return {"success": False, "error": "Could not find hermes3 model path"}
    if shutil.which("llama-server") is None:
        return {"success": False, "error": "llama-server binary not found in PATH"}

    # Stop the existing process and wait for it to release the port.
    existing = find_llama_server_process()
    if existing:
        old_pid = existing["pid"]
        try:
            os.kill(old_pid, signal.SIGTERM)
        except ProcessLookupError:
            pass  # already gone
        except Exception as e:
            return {"success": False, "error": f"Failed to kill existing: {e}"}
        else:
            if _wait_for_pid_exit(old_pid):
                logger.info("Killed existing llama-server (PID %d)", old_pid)
            else:
                logger.warning(
                    "llama-server (PID %d) still present after SIGTERM", old_pid
                )

    cmd = [
        "llama-server",
        "--model", model_path,
        "--port", str(port),
        "--host", "127.0.0.1",
        "--n-gpu-layers", str(gpu_layers),
        "--flash-attn", "on",
        "--ctx-size", str(ctx_size),
        "--alias", alias,
    ]
    try:
        # Start in background
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        return {"success": False, "error": "llama-server binary not found in PATH"}
    except Exception as e:
        return {"success": False, "error": str(e)}

    # Loading a model can take far longer than a fixed pause, so poll until
    # the server answers, the process dies, or the deadline passes.
    deadline = time.monotonic() + max(startup_timeout, 0)
    try:
        while True:
            exit_code = proc.poll()
            if exit_code is not None:
                return {
                    "success": False,
                    "error": f"llama-server exited during startup (exit code {exit_code})",
                    "pid": proc.pid,
                }
            health = check_server_health(port=port)
            if health["reachable"]:
                return {
                    "success": True,
                    "pid": proc.pid,
                    "port": port,
                    "model": model_path,
                }
            if time.monotonic() >= deadline:
                return {
                    "success": False,
                    "error": f"Started but not reachable: {health['error']}",
                    "pid": proc.pid,
                }
            time.sleep(1)
    except Exception as e:
        return {"success": False, "error": str(e)}
 def _find_hermes3_model() -> str:
    """Try to find the hermes3 model file."""
    import glob
    search_paths = [
        os.path.expanduser("~/.ollama/models/blobs/sha256-*"),
        os.path.expanduser("~/.cache/llama.cpp/*.gguf"),
        "/opt/models/*.gguf",
    ]
    for pattern in search_paths:
        matches = glob.glob(pattern)
        if matches:
            return matches[0]
    return ""
 def check_all_servers() -> List[ServerStatus]:
    """Probe Ollama and llama-server on their default ports.

    Returns the two statuses in that order: Ollama first, llama-server second.
    """
    ollama_status = check_ollama()
    llama_status = check_llama_server()
    return [ollama_status, llama_status]
 def format_status(statuses: List[ServerStatus]) -> str:
    """Render server statuses as a plain-text health report.

    Each server gets a header line with a check or cross mark followed by its
    type, reachability, health and latency; up to five model names and the
    first 100 characters of the error are added when present. The report ends
    with a line describing the llama-server process found via ps, if any.
    """
    report = ["Local Inference Health", "=" * 40, ""]
    for status in statuses:
        all_good = status.reachable and status.health_ok
        marker = "\u2705" if all_good else "\u274c"
        report += [
            f"{marker} {status.name} (port {status.port})",
            f"   Type:      {status.server_type}",
            f"   Reachable: {status.reachable}",
            f"   Healthy:   {status.health_ok}",
            f"   Latency:   {status.latency_ms}ms",
        ]
        if status.models:
            shown_models = status.models[:5]
            report.append(f"   Models:    {', '.join(shown_models)}")
        if status.error:
            report.append(f"   Error:     {status.error[:100]}")
        report.append("")
    # Append the process summary and the trailing blank line.
    running = find_llama_server_process()
    if not running:
        process_line = "llama-server process: NOT RUNNING"
    else:
        process_line = f"llama-server process: PID {running['pid']}, CPU {running['cpu']}%, MEM {running['mem']}%"
    report.extend([process_line, ""])
    return "\n".join(report)
 def main() -> int:
    """Command-line entry point for the health monitor.

    Modes, in order of precedence:
        --restart PORT  restart llama-server on PORT and print the result.
        --watch         check all servers every --interval seconds until
                        interrupted with Ctrl+C.
        otherwise       check once (all servers, or only --port) and print a
                        text report, or JSON with --report.

    ``--check`` is accepted for readability; a one-shot check is the default.

    Returns:
        0 on success; 1 when a restart failed or a checked server is
        unreachable or unhealthy. Watch mode returns 0 once interrupted.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Local inference health monitor")
    parser.add_argument("--check", action="store_true", help="Check server health")
    parser.add_argument("--port", type=int, default=0, help="Check specific port")
    parser.add_argument("--restart", type=int, default=0, metavar="PORT", help="Restart server on port")
    parser.add_argument("--watch", action="store_true", help="Continuous monitoring")
    parser.add_argument("--report", action="store_true", help="JSON status report")
    parser.add_argument("--interval", type=int, default=30, help="Watch interval in seconds")
    args = parser.parse_args()

    def _is_up(status) -> bool:
        # A server that answers with an error status is not usable either.
        return status.reachable and status.health_ok

    if args.restart:
        print(f"Restarting llama-server on port {args.restart}...")
        result = restart_llama_server(port=args.restart)
        print(json.dumps(result, indent=2))
        return 0 if result["success"] else 1

    if args.watch:
        if args.interval < 1:
            # A zero interval busy-loops and a negative one makes sleep() raise.
            parser.error("--interval must be at least 1 second")
        print(f"Monitoring every {args.interval}s. Press Ctrl+C to stop.")
        try:
            while True:
                statuses = check_all_servers()
                troubled = [s for s in statuses if not _is_up(s)]
                for s in troubled:
                    label = "UNHEALTHY" if s.reachable else "DOWN"
                    print(f"[{time.strftime('%H:%M:%S')}] {label}: {s.name} port {s.port} — {s.error}")
                if not troubled:
                    print(f"[{time.strftime('%H:%M:%S')}] All servers OK")
                time.sleep(args.interval)
        except KeyboardInterrupt:
            # Ctrl+C is the documented way to stop; exit without a traceback.
            print("\nStopped.")
        return 0

    if args.port:
        # Ollama has no /health endpoint, so use its own check on known Ollama ports.
        ollama_ports = {s["port"] for s in _DEFAULT_SERVERS if s["type"] == "ollama"}
        if args.port in ollama_ports:
            status = check_ollama(port=args.port)
        else:
            status = check_llama_server(port=args.port)
        statuses = [status]
    else:
        statuses = check_all_servers()

    if args.report:
        print(json.dumps([asdict(s) for s in statuses], indent=2))
    else:
        print(format_status(statuses))

    down = [s for s in statuses if not _is_up(s)]
    return 1 if down else 0
 if __name__ == "__main__":
    # Run the CLI and use main()'s return value as the process exit status.
    raise SystemExit(main())