Compare commits

...

1 Commits

Author SHA1 Message Date
Alexander Whitestone
97df013af1 feat: llama-server health monitor with auto-restart (#713)
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 25s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 44s
Tests / e2e (pull_request) Successful in 4m6s
Tests / test (pull_request) Failing after 44m24s
llama-server on port 8081 was down with no monitoring or auto-restart.

New scripts/llama_health_monitor.py:
- check_server_health(): HTTP health check for any inference server
- check_ollama() / check_llama_server(): typed health checks
- find_llama_server_process(): find running process via ps
- restart_llama_server(): kill existing + start new with params
- _find_hermes3_model(): auto-discover model file path
- check_all_servers(): check Ollama + llama-server together
- format_status(): human-readable health report
- --watch mode: continuous monitoring with down alerts

CLI:
  python3 llama_health_monitor.py --check
  python3 llama_health_monitor.py --check --port 8081
  python3 llama_health_monitor.py --restart 8081
  python3 llama_health_monitor.py --watch
  python3 llama_health_monitor.py --report

Closes #713
2026-04-15 18:59:39 -04:00

View File

@@ -0,0 +1,316 @@
#!/usr/bin/env python3
"""llama-server health monitor — check, restart, and report on local inference.
Monitors local inference servers (llama-server, Ollama) and can auto-restart
them when they go down.
Usage:
python3 scripts/llama_health_monitor.py --check # check all
python3 scripts/llama_health_monitor.py --check --port 8081 # check specific
python3 scripts/llama_health_monitor.py --restart 8081 # restart server
python3 scripts/llama_health_monitor.py --watch # continuous monitor
python3 scripts/llama_health_monitor.py --report # JSON status report
"""
from __future__ import annotations
import json
import logging
import os
import subprocess
import sys
import time
from dataclasses import dataclass, asdict
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)

# Default servers to monitor. Each entry: display name, TCP port, server
# "type" (selects the matching typed checker), and the HTTP path that serves
# as a cheap liveness probe for that server flavor.
_DEFAULT_SERVERS = [
    {"name": "ollama", "port": 11434, "type": "ollama", "health_path": "/api/tags"},
    {"name": "llama-server", "port": 8081, "type": "llama-server", "health_path": "/health"},
]
@dataclass
class ServerStatus:
    """Status of a single inference server."""

    name: str          # display name, e.g. "ollama" or "llama-server"
    port: int          # TCP port the server listens on
    server_type: str   # "ollama" or "llama-server"
    reachable: bool    # TCP/HTTP connection succeeded
    health_ok: bool    # health endpoint returned HTTP 200
    latency_ms: int    # round-trip time of the health probe, milliseconds
    models: List[str]  # model names reported by the server, if any
    error: str         # error description when unreachable/unhealthy, else ""
    checked_at: str    # ISO-8601 local timestamp of the check
def check_server_health(host: str = "localhost", port: int = 8081, health_path: str = "/health", timeout: int = 5) -> dict:
    """Check if a server is healthy.

    Issues a GET to ``http://{host}:{port}{health_path}`` and inspects the
    response. A non-2xx HTTP response still counts as *reachable* (the server
    answered) but not *healthy*.

    Args:
        host: Hostname or IP of the server.
        port: TCP port to probe.
        health_path: HTTP path of the health endpoint.
        timeout: Socket timeout in seconds.

    Returns:
        dict with keys ``reachable`` (bool), ``health_ok`` (bool),
        ``latency_ms`` (int), ``models`` (list[str]), ``error`` (str).
    """
    import urllib.error
    import urllib.request

    result = {
        "reachable": False,
        "health_ok": False,
        "latency_ms": 0,
        "models": [],
        "error": "",
    }
    url = f"http://{host}:{port}{health_path}"
    t0 = time.monotonic()
    try:
        req = urllib.request.Request(url, method="GET")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result["latency_ms"] = int((time.monotonic() - t0) * 1000)
            result["reachable"] = True
            if resp.status == 200:
                result["health_ok"] = True
                # Best-effort model discovery: OpenAI-style {"data": [...]}
                # and Ollama-style {"models": [...]} payloads both work.
                try:
                    data = json.loads(resp.read())
                    if isinstance(data, dict):
                        result["models"] = [
                            m.get("name", m.get("id", ""))
                            for m in data.get("data", data.get("models", []))
                            if isinstance(m, dict)  # skip non-dict entries defensively
                        ]
                except Exception:
                    pass
    except urllib.error.HTTPError as e:
        # BUGFIX: HTTPError is a URLError subclass, so a 4xx/5xx response used
        # to be reported as unreachable. The server DID answer: mark it
        # reachable but unhealthy.
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["reachable"] = True
        result["error"] = f"HTTP {e.code}: {e.reason}"
    except urllib.error.URLError as e:
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["error"] = f"Connection refused or unreachable: {e}"
    except Exception as e:
        result["latency_ms"] = int((time.monotonic() - t0) * 1000)
        result["error"] = str(e)
    return result
def check_ollama(port: int = 11434) -> ServerStatus:
    """Probe the local Ollama server and wrap the result in a ServerStatus."""
    import datetime

    probe = check_server_health(port=port, health_path="/api/tags")
    snapshot = dict(
        name="ollama",
        port=port,
        server_type="ollama",
        reachable=probe["reachable"],
        health_ok=probe["health_ok"],
        latency_ms=probe["latency_ms"],
        models=probe["models"],
        error=probe["error"],
        checked_at=datetime.datetime.now().isoformat(),
    )
    return ServerStatus(**snapshot)
def check_llama_server(port: int = 8081) -> ServerStatus:
    """Probe the local llama-server instance and wrap the result in a ServerStatus."""
    import datetime

    probe = check_server_health(port=port, health_path="/health")
    snapshot = dict(
        name="llama-server",
        port=port,
        server_type="llama-server",
        reachable=probe["reachable"],
        health_ok=probe["health_ok"],
        latency_ms=probe["latency_ms"],
        models=probe.get("models", []),
        error=probe["error"],
        checked_at=datetime.datetime.now().isoformat(),
    )
    return ServerStatus(**snapshot)
def find_llama_server_process() -> Optional[dict]:
    """Locate a running llama-server process via ``ps aux``.

    Returns a dict with ``pid``, ``cpu``, ``mem`` and ``command`` for the
    first matching row, or None when no process is found (or ``ps`` fails).
    """
    try:
        listing = subprocess.run(
            ["ps", "aux"],
            capture_output=True, text=True, timeout=5,
        )
        for row in listing.stdout.split("\n"):
            # Skip unrelated rows and the grep/self artifacts ps may show.
            if "llama-server" not in row or "grep" in row:
                continue
            cols = row.split()
            if len(cols) < 11:
                continue
            return {
                "pid": int(cols[1]),
                "cpu": cols[2],
                "mem": cols[3],
                "command": " ".join(cols[10:]),
            }
    except Exception:
        # Best-effort: any ps/parse failure is treated as "not found".
        pass
    return None
def restart_llama_server(
    model_path: str = "",
    port: int = 8081,
    ctx_size: int = 8192,
    gpu_layers: int = 99,
    alias: str = "hermes3",
) -> dict:
    """Restart llama-server with specified parameters.

    Kills any existing llama-server process (SIGTERM, escalating to SIGKILL
    if it lingers), then spawns a fresh one and verifies it is reachable.

    Args:
        model_path: GGUF model path; auto-discovered via _find_hermes3_model()
            when empty.
        port: Port for the new server.
        ctx_size: Context window size.
        gpu_layers: Number of layers to offload to GPU.
        alias: Model alias exposed by the server.

    Returns:
        dict with ``success`` (bool) plus either ``pid``/``port``/``model``
        on success or ``error`` on failure.
    """
    import signal

    # Kill existing process.
    existing = find_llama_server_process()
    if existing:
        try:
            os.kill(existing["pid"], signal.SIGTERM)
            # BUGFIX: the old code slept a flat 2s and never confirmed the
            # process exited, so the new server could race the old one for
            # the port. Poll for exit, then escalate to SIGKILL if needed.
            for _ in range(10):
                time.sleep(0.5)
                try:
                    os.kill(existing["pid"], 0)  # signal 0 = existence probe
                except ProcessLookupError:
                    break
            else:
                os.kill(existing["pid"], signal.SIGKILL)
                time.sleep(1)
            logger.info("Killed existing llama-server (PID %d)", existing["pid"])
        except ProcessLookupError:
            pass  # already gone between discovery and kill
        except Exception as e:
            return {"success": False, "error": f"Failed to kill existing: {e}"}

    # Find model path if not specified.
    if not model_path:
        model_path = _find_hermes3_model()
        if not model_path:
            return {"success": False, "error": "Could not find hermes3 model path"}

    # Build command.
    cmd = [
        "llama-server",
        "--model", model_path,
        "--port", str(port),
        "--host", "127.0.0.1",
        "--n-gpu-layers", str(gpu_layers),
        "--flash-attn", "on",
        "--ctx-size", str(ctx_size),
        "--alias", alias,
    ]
    try:
        # Start in background; server output is discarded.
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        time.sleep(3)  # Wait for startup
        # Verify it's actually serving before reporting success.
        health = check_server_health(port=port)
        if health["reachable"]:
            return {
                "success": True,
                "pid": proc.pid,
                "port": port,
                "model": model_path,
            }
        return {
            "success": False,
            "error": f"Started but not reachable: {health['error']}",
            "pid": proc.pid,
        }
    except FileNotFoundError:
        return {"success": False, "error": "llama-server binary not found in PATH"}
    except Exception as e:
        return {"success": False, "error": str(e)}
def _find_hermes3_model() -> str:
"""Try to find the hermes3 model file."""
import glob
search_paths = [
os.path.expanduser("~/.ollama/models/blobs/sha256-*"),
os.path.expanduser("~/.cache/llama.cpp/*.gguf"),
"/opt/models/*.gguf",
]
for pattern in search_paths:
matches = glob.glob(pattern)
if matches:
return matches[0]
return ""
def check_all_servers() -> List[ServerStatus]:
    """Check all configured servers.

    BUGFIX: the old implementation hard-coded the two checks and ignored the
    module-level _DEFAULT_SERVERS table, so editing the table had no effect.
    Now the table drives the checks: each entry's "type" selects the typed
    checker and its "port" is passed through. Unknown types are skipped.

    Returns:
        One ServerStatus per recognized entry, in table order.
    """
    checkers = {
        "ollama": check_ollama,
        "llama-server": check_llama_server,
    }
    return [
        checkers[entry["type"]](port=entry["port"])
        for entry in _DEFAULT_SERVERS
        if entry["type"] in checkers
    ]
def format_status(statuses: List[ServerStatus]) -> str:
    """Render server statuses as a human-readable multi-line report."""
    report = ["Local Inference Health", "=" * 40, ""]
    for status in statuses:
        marker = "\u2705" if (status.reachable and status.health_ok) else "\u274c"
        report.append(f"{marker} {status.name} (port {status.port})")
        report.append(f" Type: {status.server_type}")
        report.append(f" Reachable: {status.reachable}")
        report.append(f" Healthy: {status.health_ok}")
        report.append(f" Latency: {status.latency_ms}ms")
        if status.models:
            report.append(f" Models: {', '.join(status.models[:5])}")
        if status.error:
            report.append(f" Error: {status.error[:100]}")
        report.append("")
    # Append llama-server process info (or its absence) at the bottom.
    proc_info = find_llama_server_process()
    if proc_info is None:
        report.append("llama-server process: NOT RUNNING")
    else:
        report.append(
            f"llama-server process: PID {proc_info['pid']}, "
            f"CPU {proc_info['cpu']}%, MEM {proc_info['mem']}%"
        )
    report.append("")
    return "\n".join(report)
def main():
    """CLI entry point.

    Modes (mutually prioritized in this order): --restart, --watch, then
    one-shot check/report. Returns a process exit code: 0 when everything
    checked is healthy, 1 when any server is down or a restart failed.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Local inference health monitor")
    parser.add_argument("--check", action="store_true", help="Check server health")
    parser.add_argument("--port", type=int, default=0, help="Check specific port")
    parser.add_argument("--restart", type=int, default=0, metavar="PORT", help="Restart server on port")
    parser.add_argument("--watch", action="store_true", help="Continuous monitoring")
    parser.add_argument("--report", action="store_true", help="JSON status report")
    parser.add_argument("--interval", type=int, default=30, help="Watch interval in seconds")
    args = parser.parse_args()

    if args.restart:
        print(f"Restarting llama-server on port {args.restart}...")
        result = restart_llama_server(port=args.restart)
        print(json.dumps(result, indent=2))
        return 0 if result["success"] else 1

    if args.watch:
        print(f"Monitoring every {args.interval}s. Press Ctrl+C to stop.")
        # BUGFIX: Ctrl+C used to dump a KeyboardInterrupt traceback even
        # though the banner advertises it as the way to stop.
        try:
            while True:
                statuses = check_all_servers()
                down = [s for s in statuses if not s.reachable]
                if down:
                    for s in down:
                        # BUGFIX: ": " separator — the old message fused port
                        # and error into e.g. "port 8081Connection refused".
                        print(f"[{time.strftime('%H:%M:%S')}] DOWN: {s.name} port {s.port}: {s.error}")
                else:
                    print(f"[{time.strftime('%H:%M:%S')}] All servers OK")
                time.sleep(args.interval)
        except KeyboardInterrupt:
            return 0

    # One-shot mode: a specific port implies the llama-server checker.
    if args.port:
        statuses = [check_llama_server(port=args.port)]
    else:
        statuses = check_all_servers()
    if args.report:
        print(json.dumps([asdict(s) for s in statuses], indent=2))
    else:
        print(format_status(statuses))
    down = [s for s in statuses if not s.reachable]
    return 1 if down else 0
# Script entry point: process exit status mirrors main()'s return code.
if __name__ == "__main__":
    sys.exit(main())