Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
97df013af1 |
316
scripts/llama_health_monitor.py
Normal file
316
scripts/llama_health_monitor.py
Normal file
@@ -0,0 +1,316 @@
|
||||
#!/usr/bin/env python3
|
||||
"""llama-server health monitor — check, restart, and report on local inference.
|
||||
|
||||
Monitors local inference servers (llama-server, Ollama) and can auto-restart
|
||||
them when they go down.
|
||||
|
||||
Usage:
|
||||
python3 scripts/llama_health_monitor.py --check # check all
|
||||
python3 scripts/llama_health_monitor.py --check --port 8081 # check specific
|
||||
python3 scripts/llama_health_monitor.py --restart 8081 # restart server
|
||||
python3 scripts/llama_health_monitor.py --watch # continuous monitor
|
||||
python3 scripts/llama_health_monitor.py --report # JSON status report
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default servers to monitor
# Each entry describes one local inference server:
#   name        — display name used in reports
#   port        — TCP port probed on localhost
#   type        — server flavor, "ollama" or "llama-server"
#   health_path — HTTP path used for the health probe
_DEFAULT_SERVERS = [
    {"name": "ollama", "port": 11434, "type": "ollama", "health_path": "/api/tags"},
    {"name": "llama-server", "port": 8081, "type": "llama-server", "health_path": "/health"},
]
|
||||
|
||||
|
||||
@dataclass
class ServerStatus:
    """Status of a single inference server."""

    name: str          # display name, e.g. "ollama" or "llama-server"
    port: int          # TCP port that was probed on localhost
    server_type: str   # server flavor: "ollama" or "llama-server"
    reachable: bool    # True when the HTTP probe connected at all
    health_ok: bool    # True when the health endpoint returned HTTP 200
    latency_ms: int    # round-trip time of the health probe, milliseconds
    models: List[str]  # model names reported by the server (may be empty)
    error: str         # error message when the probe failed, "" otherwise
    checked_at: str    # ISO-8601 timestamp of when the check ran
|
||||
|
||||
|
||||
def check_server_health(host: str = "localhost", port: int = 8081, health_path: str = "/health", timeout: int = 5) -> dict:
    """Check if a server is healthy.

    Args:
        host: Hostname to probe over plain HTTP.
        port: TCP port of the server.
        health_path: Health endpoint path (e.g. "/health", "/api/tags").
        timeout: Socket timeout in seconds.

    Returns:
        Dict with keys: reachable (bool), health_ok (bool),
        latency_ms (int), models (list of str), error (str).
    """
    import urllib.request
    import urllib.error

    result = {
        "reachable": False,
        "health_ok": False,
        "latency_ms": 0,
        "models": [],
        "error": "",
    }

    url = f"http://{host}:{port}{health_path}"
    t0 = time.monotonic()

    try:
        req = urllib.request.Request(url, method="GET")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result["latency_ms"] = int((time.monotonic() - t0) * 1000)
            result["reachable"] = True

            if resp.status == 200:
                result["health_ok"] = True
                # Best-effort model listing: OpenAI-style {"data": [...]}
                # or Ollama-style {"models": [...]}.  Non-JSON bodies
                # (llama-server's /health) are simply ignored.
                try:
                    data = json.loads(resp.read())
                    if isinstance(data, dict):
                        result["models"] = [
                            m.get("name", m.get("id", ""))
                            for m in data.get("data", data.get("models", []))
                        ]
                except Exception:
                    pass
    except urllib.error.HTTPError as e:
        # Must precede the URLError handler (HTTPError subclasses it).
        # A non-2xx response (e.g. 503 while a model is still loading)
        # means the server IS reachable, just not healthy; the previous
        # code mislabeled this as "Connection refused or unreachable".
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["reachable"] = True
        result["error"] = f"HTTP {e.code}: {e.reason}"
    except urllib.error.URLError as e:
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["error"] = f"Connection refused or unreachable: {e}"
    except Exception as e:
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["error"] = str(e)

    return result
|
||||
|
||||
|
||||
def check_ollama(port: int = 11434) -> ServerStatus:
    """Check Ollama server status."""
    import datetime

    probe = check_server_health(port=port, health_path="/api/tags")
    stamp = datetime.datetime.now().isoformat()
    return ServerStatus(
        name="ollama",
        port=port,
        server_type="ollama",
        reachable=probe["reachable"],
        health_ok=probe["health_ok"],
        latency_ms=probe["latency_ms"],
        models=probe["models"],
        error=probe["error"],
        checked_at=stamp,
    )
|
||||
|
||||
|
||||
def check_llama_server(port: int = 8081) -> ServerStatus:
    """Check llama-server status."""
    import datetime

    probe = check_server_health(port=port, health_path="/health")
    stamp = datetime.datetime.now().isoformat()
    return ServerStatus(
        name="llama-server",
        port=port,
        server_type="llama-server",
        reachable=probe["reachable"],
        health_ok=probe["health_ok"],
        latency_ms=probe["latency_ms"],
        models=probe.get("models", []),
        error=probe["error"],
        checked_at=stamp,
    )
|
||||
|
||||
|
||||
def find_llama_server_process() -> Optional[dict]:
    """Find a running llama-server process via ``ps aux``.

    Returns:
        Dict with "pid" (int), "cpu" (str, percent), "mem" (str, percent),
        and "command" (str) for the first matching process, or None when
        no process is found or ``ps`` is unavailable.
    """
    try:
        result = subprocess.run(
            ["ps", "aux"],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # Best-effort: ps missing or timed out.  Report "not found"
        # rather than crash the monitor, but leave a trace — the old
        # blanket `except Exception: pass` hid these failures entirely.
        logging.getLogger(__name__).debug("ps aux failed: %s", e)
        return None

    for line in result.stdout.split("\n"):
        if "llama-server" not in line or "grep" in line:
            continue
        parts = line.split()
        # ps aux columns: USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND...
        if len(parts) < 11:
            continue
        try:
            pid = int(parts[1])
        except ValueError:
            # Malformed line: skip it and keep scanning instead of
            # aborting the whole search (the old code returned None here).
            continue
        return {
            "pid": pid,
            "cpu": parts[2],
            "mem": parts[3],
            "command": " ".join(parts[10:]),
        }
    return None
|
||||
|
||||
|
||||
def restart_llama_server(
    model_path: str = "",
    port: int = 8081,
    ctx_size: int = 8192,
    gpu_layers: int = 99,
    alias: str = "hermes3",
) -> dict:
    """Restart llama-server with specified parameters.

    Kills any existing llama-server process, then starts a new one in its
    own session and waits for the health endpoint to come up.

    Args:
        model_path: GGUF model file; auto-discovered when empty.
        port: Port for the new server.
        ctx_size: Context window size.
        gpu_layers: Number of layers to offload to GPU.
        alias: Model alias exposed by the server.

    Returns:
        Dict with "success" (bool) and, depending on outcome,
        "pid"/"port"/"model" or "error".
    """
    import signal

    # Kill existing process so the port is free for the new instance.
    existing = find_llama_server_process()
    if existing:
        try:
            os.kill(existing["pid"], signal.SIGTERM)
            time.sleep(2)  # give it a moment to shut down and release the port
            logger.info("Killed existing llama-server (PID %d)", existing["pid"])
        except ProcessLookupError:
            pass  # already gone — nothing to do
        except Exception as e:
            return {"success": False, "error": f"Failed to kill existing: {e}"}

    # Find model path if not specified
    if not model_path:
        model_path = _find_hermes3_model()
        if not model_path:
            return {"success": False, "error": "Could not find hermes3 model path"}

    # Build command
    cmd = [
        "llama-server",
        "--model", model_path,
        "--port", str(port),
        "--host", "127.0.0.1",
        "--n-gpu-layers", str(gpu_layers),
        "--flash-attn", "on",
        "--ctx-size", str(ctx_size),
        "--alias", alias,
    ]

    try:
        # start_new_session=True detaches the server from our process
        # group, so a Ctrl+C aimed at this monitor does not take the
        # freshly started server down with it (it previously would).
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )

        # Poll for readiness instead of a single fixed 3-second sleep:
        # model load time varies widely with model size and hardware.
        deadline = time.monotonic() + 15
        health = {"reachable": False, "error": "not checked"}
        while time.monotonic() < deadline:
            if proc.poll() is not None:
                # Child died during startup (bad flags, missing model, ...).
                return {
                    "success": False,
                    "error": f"llama-server exited immediately (code {proc.returncode})",
                    "pid": proc.pid,
                }
            health = check_server_health(port=port)
            if health["reachable"]:
                return {
                    "success": True,
                    "pid": proc.pid,
                    "port": port,
                    "model": model_path,
                }
            time.sleep(1)

        return {
            "success": False,
            "error": f"Started but not reachable: {health['error']}",
            "pid": proc.pid,
        }
    except FileNotFoundError:
        return {"success": False, "error": "llama-server binary not found in PATH"}
    except Exception as e:
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
def _find_hermes3_model() -> str:
|
||||
"""Try to find the hermes3 model file."""
|
||||
import glob
|
||||
search_paths = [
|
||||
os.path.expanduser("~/.ollama/models/blobs/sha256-*"),
|
||||
os.path.expanduser("~/.cache/llama.cpp/*.gguf"),
|
||||
"/opt/models/*.gguf",
|
||||
]
|
||||
for pattern in search_paths:
|
||||
matches = glob.glob(pattern)
|
||||
if matches:
|
||||
return matches[0]
|
||||
return ""
|
||||
|
||||
|
||||
def check_all_servers() -> List[ServerStatus]:
    """Check every server listed in _DEFAULT_SERVERS.

    Previously this hardcoded one Ollama and one llama-server check,
    leaving the _DEFAULT_SERVERS configuration table unused; now the
    table actually drives which servers are monitored.

    Returns:
        One ServerStatus per configured server, in table order.
    """
    results: List[ServerStatus] = []
    for server in _DEFAULT_SERVERS:
        if server["type"] == "ollama":
            results.append(check_ollama(port=server["port"]))
        else:
            results.append(check_llama_server(port=server["port"]))
    return results
|
||||
|
||||
|
||||
def format_status(statuses: List[ServerStatus]) -> str:
    """Render server statuses as a human-readable text report."""
    out: List[str] = ["Local Inference Health", "=" * 40, ""]

    for status in statuses:
        marker = "\u2705" if (status.reachable and status.health_ok) else "\u274c"
        section = [
            f"{marker} {status.name} (port {status.port})",
            f"  Type: {status.server_type}",
            f"  Reachable: {status.reachable}",
            f"  Healthy: {status.health_ok}",
            f"  Latency: {status.latency_ms}ms",
        ]
        if status.models:
            section.append(f"  Models: {', '.join(status.models[:5])}")
        if status.error:
            section.append(f"  Error: {status.error[:100]}")
        section.append("")
        out.extend(section)

    # Append local llama-server process info (best-effort ps lookup).
    proc = find_llama_server_process()
    if proc:
        out.append(f"llama-server process: PID {proc['pid']}, CPU {proc['cpu']}%, MEM {proc['mem']}%")
    else:
        out.append("llama-server process: NOT RUNNING")
    out.append("")

    return "\n".join(out)
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Returns:
        Process exit code: 0 on success / all servers healthy, 1 when the
        restart failed or at least one server is unreachable (cron-friendly).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Local inference health monitor")
    parser.add_argument("--check", action="store_true", help="Check server health")
    parser.add_argument("--port", type=int, default=0, help="Check specific port")
    parser.add_argument("--restart", type=int, default=0, metavar="PORT", help="Restart server on port")
    parser.add_argument("--watch", action="store_true", help="Continuous monitoring")
    parser.add_argument("--report", action="store_true", help="JSON status report")
    parser.add_argument("--interval", type=int, default=30, help="Watch interval in seconds")
    args = parser.parse_args()

    if args.restart:
        print(f"Restarting llama-server on port {args.restart}...")
        result = restart_llama_server(port=args.restart)
        print(json.dumps(result, indent=2))
        return 0 if result["success"] else 1

    if args.watch:
        print(f"Monitoring every {args.interval}s. Press Ctrl+C to stop.")
        try:
            while True:
                statuses = check_all_servers()
                down = [s for s in statuses if not s.reachable]
                if down:
                    for s in down:
                        print(f"[{time.strftime('%H:%M:%S')}] DOWN: {s.name} port {s.port} — {s.error}")
                else:
                    print(f"[{time.strftime('%H:%M:%S')}] All servers OK")
                time.sleep(args.interval)
        except KeyboardInterrupt:
            # The banner promises Ctrl+C stops the monitor; previously this
            # dumped a KeyboardInterrupt traceback and exited non-zero.
            print()
            return 0

    # Default mode (covers --check and no flags): one-shot status check.
    if args.port:
        statuses = [check_llama_server(port=args.port)]
    else:
        statuses = check_all_servers()

    if args.report:
        print(json.dumps([asdict(s) for s in statuses], indent=2))
    else:
        print(format_status(statuses))

    # Exit non-zero when any server is down so scripts/cron can react.
    down = [s for s in statuses if not s.reachable]
    return 1 if down else 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user