diff --git a/tests/server_manager.py b/tests/server_manager.py
new file mode 100644
index 00000000..93039ab4
--- /dev/null
+++ b/tests/server_manager.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""
+TurboQuant Server Manager
+
+Manages llama-server lifecycle for integration tests:
+- Start server with TurboQuant flags
+- Wait for health check
+- Stop server on teardown
+
+Usage:
+    from tests.server_manager import TurboQuantServer
+
+    with TurboQuantServer(model_path="/path/to/model.gguf") as server:
+        url = server.url  # e.g. http://localhost:8081
+        # Run tests against server
+"""
+
+import json
+import os
+import shutil
+import signal
+import subprocess
+import tempfile
+import time
+import urllib.request
+from pathlib import Path
+from typing import Optional
+
+
+class TurboQuantServer:
+    """Context manager for llama-server with TurboQuant."""
+
+    def __init__(
+        self,
+        model_path: str,
+        port: int = 8081,
+        kv_type: str = "turbo4",
+        context_size: int = 32768,
+        server_binary: Optional[str] = None,
+        timeout: float = 60.0,
+        host: str = "127.0.0.1",
+    ):
+        self.model_path = model_path
+        self.port = port
+        self.kv_type = kv_type
+        self.context_size = context_size
+        self.timeout = timeout
+        self.host = host
+
+        # Use the given binary, or search common install locations and PATH.
+        self.server_binary = server_binary or find_server_binary()
+
+        self.process: Optional[subprocess.Popen] = None
+        self._log_file = None
+
+    @property
+    def url(self) -> str:
+        return f"http://{self.host}:{self.port}"
+
+    def _build_command(self) -> list:
+        return [
+            self.server_binary,
+            "-m", self.model_path,
+            "--port", str(self.port),
+            "--host", self.host,
+            "-ctk", self.kv_type,
+            "-ctv", self.kv_type,
+            "-c", str(self.context_size),
+        ]
+
+    def _check_health(self) -> bool:
+        # Healthy once /v1/models lists the loaded model.
+        try:
+            req = urllib.request.Request(f"{self.url}/v1/models")
+            with urllib.request.urlopen(req, timeout=5) as resp:
+                data = json.loads(resp.read())
+            return len(data.get("data", [])) > 0
+        except Exception:
+            return False
+
+    def start(self) -> str:
+        """Start the server and wait for it to be healthy. Returns the server URL."""
+        if not self.server_binary:
+            raise RuntimeError(
+                "llama-server binary not found. Set server_binary or install to a standard location."
+            )
+
+        if not Path(self.model_path).exists():
+            raise FileNotFoundError(f"Model not found: {self.model_path}")
+
+        cmd = self._build_command()
+
+        # Set TurboQuant env
+        env = os.environ.copy()
+        env["TURBO_LAYER_ADAPTIVE"] = "7"
+
+        # Log to a temp file rather than a pipe: llama-server logs heavily,
+        # and an undrained PIPE would fill up and block the server.
+        self._log_file = tempfile.TemporaryFile()
+        self.process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.DEVNULL,
+            stderr=self._log_file,
+            env=env,
+        )
+
+        # Wait for health
+        start = time.time()
+        while time.time() - start < self.timeout:
+            if self.process.poll() is not None:
+                self._log_file.seek(0)
+                stderr = self._log_file.read().decode(errors="replace")
+                raise RuntimeError(
+                    f"Server exited early (code {self.process.returncode}): {stderr[:500]}"
+                )
+
+            if self._check_health():
+                return self.url
+
+            time.sleep(1.0)
+
+        self.stop()
+        raise TimeoutError(f"Server did not become healthy within {self.timeout}s")
+
+    def stop(self):
+        """Stop the server."""
+        if self.process:
+            try:
+                self.process.send_signal(signal.SIGTERM)
+                self.process.wait(timeout=10)
+            except subprocess.TimeoutExpired:
+                self.process.kill()
+                self.process.wait(timeout=5)
+            except Exception:
+                pass
+            self.process = None
+        if self._log_file:
+            self._log_file.close()
+            self._log_file = None
+
+    def __enter__(self) -> "TurboQuantServer":
+        self.start()
+        return self
+
+    def __exit__(self, *args):
+        self.stop()
+
+
+def find_server_binary() -> Optional[str]:
+    """Find llama-server binary in common locations, then on PATH."""
+    candidates = [
+        Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
+        Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
+    ]
+    for c in candidates:
+        if c.exists():
+            return str(c)
+
+    # Fall back to PATH (portable, unlike shelling out to `which`)
+    return shutil.which("llama-server")
+
+
+def find_model(model_dir: Optional[str] = None) -> Optional[str]:
+    """Find a GGUF model file."""
+    search_dirs = [
+        model_dir,
+        os.environ.get("TURBOQUANT_MODEL_DIR"),
+        str(Path.home() / "models"),
+        "/opt/models",
+        "/tmp/models",
+    ]
+
+    for d in search_dirs:
+        if not d:
+            continue
+        p = Path(d)
+        if p.is_file() and p.suffix == ".gguf":
+            return str(p)
+        if p.is_dir():
+            # Return the first .gguf found (sorted for determinism)
+            for f in sorted(p.rglob("*.gguf")):
+                return str(f)
+
+    return None
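One way the integration tests might wire this manager into pytest (a sketch, not part of the diff: the fixture name `turboquant_server` and the session scope are illustrative; `TurboQuantServer`, `find_model`, and `TURBOQUANT_MODEL_DIR` come from the module above):

    # conftest.py (sketch; fixture name is hypothetical)
    import pytest

    from tests.server_manager import TurboQuantServer, find_model

    @pytest.fixture(scope="session")
    def turboquant_server():
        model = find_model()
        if model is None:
            pytest.skip("no GGUF model found; set TURBOQUANT_MODEL_DIR")
        # One server for the whole session; __exit__ stops it on teardown
        with TurboQuantServer(model_path=model) as server:
            yield server

Session scope keeps the (slow) model load to a single startup per test run, while the context manager guarantees SIGTERM-then-kill teardown even when tests fail.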