198 lines
5.8 KiB
Python
198 lines
5.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
TurboQuant Server Manager
|
|
|
|
Manages llama-server lifecycle for integration tests:
|
|
- Start server with TurboQuant flags
|
|
- Wait for health check
|
|
- Stop server on teardown
|
|
|
|
Usage:
|
|
from tests.server_manager import TurboQuantServer
|
|
|
|
with TurboQuantServer(model_path="/path/to/model.gguf") as server:
|
|
url = server.url # e.g. http://localhost:8081
|
|
# Run tests against server
|
|
"""
|
|
|
|
import json
import os
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Optional
|
|
|
|
|
|
class TurboQuantServer:
    """Context manager that runs llama-server with TurboQuant KV-cache flags.

    Entering the context starts the server process and blocks until its
    ``/v1/models`` endpoint reports at least one model; leaving the context
    terminates the process.

    The child's stdout is discarded and its stderr is written to an anonymous
    temporary file rather than to pipes. Nothing reads the output while the
    server is running, so pipes would eventually fill up and block the server.
    As a consequence ``self.process.stdout`` and ``self.process.stderr`` are
    ``None``.

    Args:
        model_path: Path to the model file passed to the server via ``-m``.
        port: TCP port passed via ``--port``.
        kv_type: Cache type passed to both ``-ctk`` and ``-ctv``.
        context_size: Context size passed via ``-c``.
        server_binary: Explicit path to the server executable. When omitted
            (or empty), a few standard locations and PATH are probed.
        timeout: Seconds to wait for the server to become healthy.
        host: Address passed via ``--host`` and used to build ``url``.
    """

    def __init__(
        self,
        model_path: str,
        port: int = 8081,
        kv_type: str = "turbo4",
        context_size: int = 32768,
        server_binary: Optional[str] = None,
        timeout: float = 60.0,
        host: str = "127.0.0.1",
    ):
        self.model_path = model_path
        self.port = port
        self.kv_type = kv_type
        self.context_size = context_size
        self.timeout = timeout
        self.host = host

        # ``None`` here is not an error yet; start() reports a missing binary.
        self.server_binary = server_binary or self._probe_binary()

        self.process: Optional[subprocess.Popen] = None
        # Binary temp file receiving the child's stderr while it runs.
        self._stderr_log = None

    @staticmethod
    def _probe_binary() -> Optional[str]:
        """Return the first candidate that can actually be executed, or None."""
        candidates = [
            Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
            Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
            Path("llama-server"),  # bare name: resolved through PATH at exec time
        ]
        for candidate in candidates:
            # Only the bare name may be tried without existing on disk.
            if len(candidate.parts) > 1 and not candidate.exists():
                continue
            try:
                subprocess.run([str(candidate), "--help"], capture_output=True, timeout=5)
            except (OSError, subprocess.TimeoutExpired):
                # OSError covers missing files as well as non-executable ones.
                continue
            return str(candidate)
        return None

    @property
    def url(self) -> str:
        """Base URL of the server, e.g. ``http://127.0.0.1:8081``."""
        return f"http://{self.host}:{self.port}"

    def _build_command(self) -> list:
        """Return the argv list used to launch the server."""
        return [
            self.server_binary,
            "-m", self.model_path,
            "--port", str(self.port),
            "--host", self.host,
            "-ctk", self.kv_type,
            "-ctv", self.kv_type,
            "-c", str(self.context_size),
        ]

    def _check_health(self) -> bool:
        """Return True when ``/v1/models`` answers with a non-empty ``data`` list.

        Never raises: this is a polling probe, so any failure (connection
        refused, HTTP error, malformed JSON, unexpected payload shape) simply
        means "not ready yet".
        """
        try:
            # ``with`` closes the response so polling does not leak sockets.
            with urllib.request.urlopen(f"{self.url}/v1/models", timeout=5) as resp:
                payload = json.loads(resp.read())
            return "data" in payload and len(payload.get("data", [])) > 0
        except Exception:
            return False

    def _read_stderr(self) -> str:
        """Return everything the child wrote to stderr so far.

        Only call this after the child has exited: the file offset is shared
        with the child, so seeking while it still writes would corrupt the log.
        """
        log = self._stderr_log
        if log is None:
            return ""
        log.seek(0)
        return log.read().decode(errors="replace")

    def _wait_until_healthy(self) -> None:
        """Poll once per second until healthy, the child exits, or time runs out.

        Raises:
            RuntimeError: The server process exited before becoming healthy.
            TimeoutError: The server was not healthy within ``self.timeout``.
        """
        # Monotonic clock: immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + self.timeout
        while time.monotonic() < deadline:
            returncode = self.process.poll()
            if returncode is not None:
                stderr = self._read_stderr()
                raise RuntimeError(f"Server exited early (code {returncode}): {stderr[:500]}")

            if self._check_health():
                return

            time.sleep(1.0)

        raise TimeoutError(f"Server did not become healthy within {self.timeout}s")

    def start(self) -> str:
        """Start the server and wait for it to be healthy. Returns the server URL.

        The child inherits the current environment with
        ``TURBO_LAYER_ADAPTIVE`` set to ``"7"``.

        Raises:
            RuntimeError: The server is already started, no server binary is
                known, or the process exited before becoming healthy.
            FileNotFoundError: ``model_path`` does not exist.
            TimeoutError: The server was not healthy within ``self.timeout``.
        """
        if self.process is not None:
            # Starting again would drop the handle to the running child.
            raise RuntimeError("Server already started; call stop() first.")

        if not self.server_binary:
            raise RuntimeError(
                "llama-server binary not found. Set server_binary or install to standard location."
            )

        if not Path(self.model_path).exists():
            raise FileNotFoundError(f"Model not found: {self.model_path}")

        env = os.environ.copy()
        env["TURBO_LAYER_ADAPTIVE"] = "7"

        self._stderr_log = tempfile.TemporaryFile()
        try:
            self.process = subprocess.Popen(
                self._build_command(),
                stdout=subprocess.DEVNULL,
                stderr=self._stderr_log,
                env=env,
            )
            self._wait_until_healthy()
        except BaseException:
            # __exit__ is not called when __enter__ fails, so clean up here;
            # BaseException also covers Ctrl-C during the wait.
            self.stop()
            raise
        return self.url

    def stop(self):
        """Stop the server (SIGTERM, then kill after 10 s) and release resources.

        Safe to call when the server is not running.
        """
        proc, self.process = self.process, None
        log, self._stderr_log = getattr(self, "_stderr_log", None), None
        try:
            if proc is not None:
                try:
                    proc.send_signal(signal.SIGTERM)
                    proc.wait(timeout=10)
                except subprocess.TimeoutExpired:
                    proc.kill()
                    proc.wait(timeout=5)
                except OSError:
                    # Best-effort teardown: the process may already be gone.
                    pass
        finally:
            if log is not None:
                log.close()

    def __enter__(self) -> "TurboQuantServer":
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()
|
|
|
|
|
|
def find_server_binary() -> Optional[str]:
    """Find the llama-server binary in common locations.

    Checks the TurboQuant build directories under the user's home and under
    ``/opt`` first, then falls back to searching PATH.

    Returns:
        The path to the binary as a string, or ``None`` when nothing is found.
    """
    candidates = [
        Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
        Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
    ]
    for candidate in candidates:
        # is_file() rather than exists(): a directory of that name is no binary.
        if candidate.is_file():
            return str(candidate)

    # shutil.which searches PATH in-process; no dependency on an external
    # `which` program being installed.
    return shutil.which("llama-server")
|
|
|
|
|
|
def find_model(model_dir: Optional[str] = None) -> Optional[str]:
    """Locate a GGUF model file.

    Locations are tried in this order: ``model_dir``, the
    ``TURBOQUANT_MODEL_DIR`` environment variable, ``~/models``,
    ``/opt/models`` and ``/tmp/models``. Unset or empty entries are skipped.
    A location that is itself a ``.gguf`` file is returned directly; a
    directory is searched recursively and the first ``*.gguf`` match in
    sorted order is returned.

    Returns:
        The model path as a string, or ``None`` when no location yields one.
    """
    locations = (
        model_dir,
        os.environ.get("TURBOQUANT_MODEL_DIR"),
        str(Path.home() / "models"),
        "/opt/models",
        "/tmp/models",
    )

    for location in locations:
        if location:
            root = Path(location)
            if root.is_file() and root.suffix == ".gguf":
                return str(root)
            if root.is_dir():
                matches = sorted(root.rglob("*.gguf"))
                if matches:
                    return str(matches[0])

    return None
|