Compare commits

...

1 Commits

Author SHA1 Message Date
Alexander Whitestone
97df013af1 feat: llama-server health monitor with auto-restart (#713)
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 25s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 44s
Tests / e2e (pull_request) Successful in 4m6s
Tests / test (pull_request) Failing after 44m24s
llama-server on port 8081 was down with no monitoring or auto-restart.

New scripts/llama_health_monitor.py:
- check_server_health(): HTTP health check for any inference server
- check_ollama() / check_llama_server(): typed health checks
- find_llama_server_process(): find running process via ps
- restart_llama_server(): kill existing + start new with params
- _find_hermes3_model(): auto-discover model file path
- check_all_servers(): check Ollama + llama-server together
- format_status(): human-readable health report
- --watch mode: continuous monitoring with down alerts

CLI:
  python3 llama_health_monitor.py --check
  python3 llama_health_monitor.py --check --port 8081
  python3 llama_health_monitor.py --restart 8081
  python3 llama_health_monitor.py --watch
  python3 llama_health_monitor.py --report

Closes #713
2026-04-15 18:59:39 -04:00

View File

@@ -0,0 +1,316 @@
#!/usr/bin/env python3
"""llama-server health monitor — check, restart, and report on local inference.
Monitors local inference servers (llama-server, Ollama) and can auto-restart
them when they go down.
Usage:
python3 scripts/llama_health_monitor.py --check # check all
python3 scripts/llama_health_monitor.py --check --port 8081 # check specific
python3 scripts/llama_health_monitor.py --restart 8081 # restart server
python3 scripts/llama_health_monitor.py --watch # continuous monitor
python3 scripts/llama_health_monitor.py --report # JSON status report
"""
from __future__ import annotations
import json
import logging
import os
import subprocess
import sys
import time
from dataclasses import dataclass, asdict
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)

# Default servers to monitor. Each entry: display name, TCP port, server
# "type" (selects the matching typed checker), and the HTTP path that serves
# as a cheap liveness probe for that server flavor.
_DEFAULT_SERVERS = [
    {"name": "ollama", "port": 11434, "type": "ollama", "health_path": "/api/tags"},
    {"name": "llama-server", "port": 8081, "type": "llama-server", "health_path": "/health"},
]
@dataclass
class ServerStatus:
    """Status of a single inference server."""

    name: str          # display name, e.g. "ollama" or "llama-server"
    port: int          # TCP port the server listens on
    server_type: str   # "ollama" or "llama-server"
    reachable: bool    # TCP/HTTP connection succeeded
    health_ok: bool    # health endpoint returned HTTP 200
    latency_ms: int    # round-trip time of the health probe, milliseconds
    models: List[str]  # model names reported by the server, if any
    error: str         # error description when unreachable/unhealthy, else ""
    checked_at: str    # ISO-8601 local timestamp of the check
def check_server_health(host: str = "localhost", port: int = 8081, health_path: str = "/health", timeout: int = 5) -> dict:
    """Check if a server is healthy.

    Issues a GET to ``http://{host}:{port}{health_path}`` and inspects the
    response. A non-2xx HTTP response still counts as *reachable* (the server
    answered) but not *healthy*.

    Args:
        host: Hostname or IP of the server.
        port: TCP port to probe.
        health_path: HTTP path of the health endpoint.
        timeout: Socket timeout in seconds.

    Returns:
        dict with keys ``reachable`` (bool), ``health_ok`` (bool),
        ``latency_ms`` (int), ``models`` (list[str]), ``error`` (str).
    """
    import urllib.error
    import urllib.request

    result = {
        "reachable": False,
        "health_ok": False,
        "latency_ms": 0,
        "models": [],
        "error": "",
    }
    url = f"http://{host}:{port}{health_path}"
    t0 = time.monotonic()
    try:
        req = urllib.request.Request(url, method="GET")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result["latency_ms"] = int((time.monotonic() - t0) * 1000)
            result["reachable"] = True
            if resp.status == 200:
                result["health_ok"] = True
                # Best-effort model discovery: OpenAI-style {"data": [...]}
                # and Ollama-style {"models": [...]} payloads both work.
                try:
                    data = json.loads(resp.read())
                    if isinstance(data, dict):
                        result["models"] = [
                            m.get("name", m.get("id", ""))
                            for m in data.get("data", data.get("models", []))
                            if isinstance(m, dict)  # skip non-dict entries defensively
                        ]
                except Exception:
                    pass
    except urllib.error.HTTPError as e:
        # BUGFIX: HTTPError is a URLError subclass, so a 4xx/5xx response used
        # to be reported as unreachable. The server DID answer: mark it
        # reachable but unhealthy.
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["reachable"] = True
        result["error"] = f"HTTP {e.code}: {e.reason}"
    except urllib.error.URLError as e:
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["error"] = f"Connection refused or unreachable: {e}"
    except Exception as e:
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["error"] = str(e)
    return result
def check_ollama(port: int = 11434) -> ServerStatus:
    """Probe the local Ollama server and wrap the result in a ServerStatus."""
    import datetime

    probe = check_server_health(port=port, health_path="/api/tags")
    snapshot = dict(
        name="ollama",
        port=port,
        server_type="ollama",
        reachable=probe["reachable"],
        health_ok=probe["health_ok"],
        latency_ms=probe["latency_ms"],
        models=probe["models"],
        error=probe["error"],
        checked_at=datetime.datetime.now().isoformat(),
    )
    return ServerStatus(**snapshot)
def check_llama_server(port: int = 8081) -> ServerStatus:
    """Probe the local llama-server instance and wrap the result in a ServerStatus."""
    import datetime

    probe = check_server_health(port=port, health_path="/health")
    snapshot = dict(
        name="llama-server",
        port=port,
        server_type="llama-server",
        reachable=probe["reachable"],
        health_ok=probe["health_ok"],
        latency_ms=probe["latency_ms"],
        models=probe.get("models", []),
        error=probe["error"],
        checked_at=datetime.datetime.now().isoformat(),
    )
    return ServerStatus(**snapshot)
def find_llama_server_process() -> Optional[dict]:
    """Locate a running llama-server process via ``ps aux``.

    Returns a dict with ``pid``, ``cpu``, ``mem`` and ``command`` for the
    first matching row, or None when no process is found (or ``ps`` fails).
    """
    try:
        listing = subprocess.run(
            ["ps", "aux"],
            capture_output=True, text=True, timeout=5,
        )
        for row in listing.stdout.split("\n"):
            # Skip unrelated rows and the grep/self artifacts ps may show.
            if "llama-server" not in row or "grep" in row:
                continue
            cols = row.split()
            if len(cols) < 11:
                continue
            return {
                "pid": int(cols[1]),
                "cpu": cols[2],
                "mem": cols[3],
                "command": " ".join(cols[10:]),
            }
    except Exception:
        # Best-effort: any ps/parse failure is treated as "not found".
        pass
    return None
def restart_llama_server(
    model_path: str = "",
    port: int = 8081,
    ctx_size: int = 8192,
    gpu_layers: int = 99,
    alias: str = "hermes3",
) -> dict:
    """Restart llama-server with specified parameters.

    Kills any existing llama-server process (SIGTERM, escalating to SIGKILL
    if it lingers), then spawns a fresh one and verifies it is reachable.

    Args:
        model_path: GGUF model path; auto-discovered via _find_hermes3_model()
            when empty.
        port: Port for the new server.
        ctx_size: Context window size.
        gpu_layers: Number of layers to offload to GPU.
        alias: Model alias exposed by the server.

    Returns:
        dict with ``success`` (bool) plus either ``pid``/``port``/``model``
        on success or ``error`` on failure.
    """
    import signal

    # Kill existing process.
    existing = find_llama_server_process()
    if existing:
        try:
            os.kill(existing["pid"], signal.SIGTERM)
            # BUGFIX: the old code slept a flat 2s and never confirmed the
            # process exited, so the new server could race the old one for
            # the port. Poll for exit, then escalate to SIGKILL if needed.
            for _ in range(10):
                time.sleep(0.5)
                try:
                    os.kill(existing["pid"], 0)  # signal 0 = existence probe
                except ProcessLookupError:
                    break
            else:
                os.kill(existing["pid"], signal.SIGKILL)
                time.sleep(1)
            logger.info("Killed existing llama-server (PID %d)", existing["pid"])
        except ProcessLookupError:
            pass  # already gone between discovery and kill
        except Exception as e:
            return {"success": False, "error": f"Failed to kill existing: {e}"}

    # Find model path if not specified.
    if not model_path:
        model_path = _find_hermes3_model()
        if not model_path:
            return {"success": False, "error": "Could not find hermes3 model path"}

    # Build command.
    cmd = [
        "llama-server",
        "--model", model_path,
        "--port", str(port),
        "--host", "127.0.0.1",
        "--n-gpu-layers", str(gpu_layers),
        "--flash-attn", "on",
        "--ctx-size", str(ctx_size),
        "--alias", alias,
    ]
    try:
        # Start in background; server output is discarded.
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        time.sleep(3)  # Wait for startup
        # Verify it's actually serving before reporting success.
        health = check_server_health(port=port)
        if health["reachable"]:
            return {
                "success": True,
                "pid": proc.pid,
                "port": port,
                "model": model_path,
            }
        return {
            "success": False,
            "error": f"Started but not reachable: {health['error']}",
            "pid": proc.pid,
        }
    except FileNotFoundError:
        return {"success": False, "error": "llama-server binary not found in PATH"}
    except Exception as e:
        return {"success": False, "error": str(e)}
def _find_hermes3_model() -> str:
"""Try to find the hermes3 model file."""
import glob
search_paths = [
os.path.expanduser("~/.ollama/models/blobs/sha256-*"),
os.path.expanduser("~/.cache/llama.cpp/*.gguf"),
"/opt/models/*.gguf",
]
for pattern in search_paths:
matches = glob.glob(pattern)
if matches:
return matches[0]
return ""
def check_all_servers() -> List[ServerStatus]:
    """Check all configured servers.

    BUGFIX: the old implementation hard-coded the two checks and ignored the
    module-level _DEFAULT_SERVERS table, so editing the table had no effect.
    Now the table drives the checks: each entry's "type" selects the typed
    checker and its "port" is passed through. Unknown types are skipped.

    Returns:
        One ServerStatus per recognized entry, in table order.
    """
    checkers = {
        "ollama": check_ollama,
        "llama-server": check_llama_server,
    }
    return [
        checkers[entry["type"]](port=entry["port"])
        for entry in _DEFAULT_SERVERS
        if entry["type"] in checkers
    ]
def format_status(statuses: List[ServerStatus]) -> str:
    """Render server statuses as a human-readable multi-line report."""
    report = ["Local Inference Health", "=" * 40, ""]
    for status in statuses:
        marker = "\u2705" if (status.reachable and status.health_ok) else "\u274c"
        report.append(f"{marker} {status.name} (port {status.port})")
        report.append(f" Type: {status.server_type}")
        report.append(f" Reachable: {status.reachable}")
        report.append(f" Healthy: {status.health_ok}")
        report.append(f" Latency: {status.latency_ms}ms")
        if status.models:
            report.append(f" Models: {', '.join(status.models[:5])}")
        if status.error:
            report.append(f" Error: {status.error[:100]}")
        report.append("")
    # Append llama-server process info (or its absence) at the bottom.
    proc_info = find_llama_server_process()
    if proc_info is None:
        report.append("llama-server process: NOT RUNNING")
    else:
        report.append(
            f"llama-server process: PID {proc_info['pid']}, "
            f"CPU {proc_info['cpu']}%, MEM {proc_info['mem']}%"
        )
    report.append("")
    return "\n".join(report)
def main():
    """CLI entry point.

    Modes (mutually prioritized in this order): --restart, --watch, then
    one-shot check/report. Returns a process exit code: 0 when everything
    checked is healthy, 1 when any server is down or a restart failed.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Local inference health monitor")
    parser.add_argument("--check", action="store_true", help="Check server health")
    parser.add_argument("--port", type=int, default=0, help="Check specific port")
    parser.add_argument("--restart", type=int, default=0, metavar="PORT", help="Restart server on port")
    parser.add_argument("--watch", action="store_true", help="Continuous monitoring")
    parser.add_argument("--report", action="store_true", help="JSON status report")
    parser.add_argument("--interval", type=int, default=30, help="Watch interval in seconds")
    args = parser.parse_args()

    if args.restart:
        print(f"Restarting llama-server on port {args.restart}...")
        result = restart_llama_server(port=args.restart)
        print(json.dumps(result, indent=2))
        return 0 if result["success"] else 1

    if args.watch:
        print(f"Monitoring every {args.interval}s. Press Ctrl+C to stop.")
        # BUGFIX: Ctrl+C used to dump a KeyboardInterrupt traceback even
        # though the banner advertises it as the way to stop.
        try:
            while True:
                statuses = check_all_servers()
                down = [s for s in statuses if not s.reachable]
                if down:
                    for s in down:
                        # BUGFIX: ": " separator — the old message fused port
                        # and error into e.g. "port 8081Connection refused".
                        print(f"[{time.strftime('%H:%M:%S')}] DOWN: {s.name} port {s.port}: {s.error}")
                else:
                    print(f"[{time.strftime('%H:%M:%S')}] All servers OK")
                time.sleep(args.interval)
        except KeyboardInterrupt:
            return 0

    # One-shot mode: a specific port implies the llama-server checker.
    if args.port:
        statuses = [check_llama_server(port=args.port)]
    else:
        statuses = check_all_servers()
    if args.report:
        print(json.dumps([asdict(s) for s in statuses], indent=2))
    else:
        print(format_status(statuses))
    down = [s for s in statuses if not s.reachable]
    return 1 if down else 0
# Script entry point: process exit status mirrors main()'s return code.
if __name__ == "__main__":
    sys.exit(main())