# turboquant/tests/server_manager.py

#!/usr/bin/env python3
"""
TurboQuant Server Manager

Manages llama-server lifecycle for integration tests:
- Start server with TurboQuant flags
- Wait for health check
- Stop server on teardown

Usage:
    from tests.server_manager import TurboQuantServer

    with TurboQuantServer(model_path="/path/to/model.gguf") as server:
        url = server.url  # e.g. http://127.0.0.1:8081
        # Run tests against server
"""

import json
import os
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Optional


class TurboQuantServer:
    """Context manager that runs a llama-server process with TurboQuant flags.

    Entering the context starts the server and blocks until its
    ``/v1/models`` endpoint lists at least one model; leaving the context
    terminates the process.

    Args:
        model_path: Path to the model file, passed to ``-m``.
        port: TCP port, passed to ``--port``.
        kv_type: Value passed to both ``-ctk`` and ``-ctv``.
        context_size: Value passed to ``-c``.
        server_binary: Explicit path to the llama-server executable. When
            omitted, the standard install locations and then ``PATH`` are
            searched; ``server_binary`` is ``None`` if nothing is found.
        timeout: Seconds ``start()`` waits for the server to become healthy.
        host: Address passed to ``--host`` and used to build ``url``.
    """

    # Open temp file receiving the child's stderr while a server is running.
    # Declared at class level so stop() is safe on partially built instances.
    _stderr_log = None

    def __init__(
        self,
        model_path: str,
        port: int = 8081,
        kv_type: str = "turbo4",
        context_size: int = 32768,
        server_binary: Optional[str] = None,
        timeout: float = 60.0,
        host: str = "127.0.0.1",
    ):
        self.model_path = model_path
        self.port = port
        self.kv_type = kv_type
        self.context_size = context_size
        self.timeout = timeout
        self.host = host

        # Resolve the binary without executing it: constructing the manager
        # must not spawn processes or fail on a non-runnable candidate.
        self.server_binary: Optional[str] = (
            server_binary if server_binary else self._locate_binary()
        )

        self.process: Optional[subprocess.Popen] = None
        self._stderr_log = None

    @staticmethod
    def _locate_binary() -> Optional[str]:
        """Return the first executable llama-server found, or ``None``.

        Checks the per-user and ``/opt`` build directories first, then
        falls back to a ``PATH`` lookup.
        """
        candidates = (
            Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
            Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
        )
        for candidate in candidates:
            if candidate.is_file() and os.access(candidate, os.X_OK):
                return str(candidate)
        return shutil.which("llama-server")

    @property
    def url(self) -> str:
        """Base URL of the server, built from ``host`` and ``port``."""
        return f"http://{self.host}:{self.port}"

    def _build_command(self) -> list:
        """Return the argv list used to launch the server."""
        cmd = [
            self.server_binary,
            "-m", self.model_path,
            "--port", str(self.port),
            "--host", self.host,
            "-ctk", self.kv_type,
            "-ctv", self.kv_type,
            "-c", str(self.context_size),
        ]
        return cmd

    def _check_health(self) -> bool:
        """Return True when ``/v1/models`` answers with a non-empty ``data`` list."""
        try:
            req = urllib.request.Request(f"{self.url}/v1/models")
            # The context manager closes the connection after every probe.
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            return "data" in data and len(data.get("data", [])) > 0
        except Exception:
            # Deliberately broad: while the server is booting, any failure
            # (refused connection, HTTP error, malformed or unexpected JSON)
            # simply means "not healthy yet".
            return False

    def _read_stderr_tail(self, limit: int = 500) -> str:
        """Return up to the last ``limit`` characters the server wrote to stderr.

        Only call this once the child has exited: the file offset is shared
        with the child process.
        """
        log = self._stderr_log
        if log is None:
            return ""
        try:
            log.seek(0, os.SEEK_END)
            size = log.tell()
            # A UTF-8 character is at most 4 bytes, so this window always
            # holds at least ``limit`` characters when that many exist.
            log.seek(max(0, size - 4 * limit))
            raw = log.read()
        except (OSError, ValueError):
            return ""
        return raw.decode("utf-8", errors="replace")[-limit:]

    def _close_stderr_log(self) -> None:
        """Close and forget the stderr capture file, if any."""
        log, self._stderr_log = self._stderr_log, None
        if log is not None:
            log.close()

    def start(self) -> str:
        """Start the server and wait for it to be healthy.

        Returns:
            The server URL.

        Raises:
            RuntimeError: No server binary is known, a server started by
                this instance is still running, or the process exited
                before becoming healthy (the message carries the tail of
                its stderr output).
            FileNotFoundError: ``model_path`` does not exist.
            TimeoutError: The server did not become healthy within
                ``timeout`` seconds; the process is stopped first.
        """
        if not self.server_binary:
            raise RuntimeError(
                "llama-server binary not found. Set server_binary or install to standard location."
            )

        if not Path(self.model_path).exists():
            raise FileNotFoundError(f"Model not found: {self.model_path}")

        if self.process is not None:
            if self.process.poll() is None:
                # Spawning a second child would orphan the first one.
                raise RuntimeError("Server is already running")
            # Reap a previously finished child and release its log file.
            self.stop()

        cmd = self._build_command()

        # Set TurboQuant env
        env = os.environ.copy()
        env["TURBO_LAYER_ADAPTIVE"] = "7"

        # Never hand the child an undrained PIPE: once the OS pipe buffer is
        # full the server would block on its next log write. stdout is
        # discarded and stderr goes to an anonymous temp file so it can
        # still be quoted when the server dies during startup.
        self._stderr_log = tempfile.TemporaryFile()
        try:
            self.process = subprocess.Popen(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=self._stderr_log,
                env=env,
            )
        except OSError:
            self._close_stderr_log()
            raise

        # Wait for health. A monotonic clock keeps the deadline immune to
        # wall-clock adjustments.
        deadline = time.monotonic() + self.timeout
        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break

            if self.process.poll() is not None:
                code = self.process.returncode
                # The tail is where the actual failure reason is logged.
                stderr = self._read_stderr_tail()
                self.stop()
                raise RuntimeError(f"Server exited early (code {code}): {stderr}")

            if self._check_health():
                return self.url

            # Sleep by waiting on the child so an early exit is noticed
            # immediately instead of after a full polling interval.
            try:
                self.process.wait(timeout=min(1.0, remaining))
            except subprocess.TimeoutExpired:
                pass

        self.stop()
        raise TimeoutError(f"Server did not become healthy within {self.timeout}s")

    def stop(self):
        """Stop the server and release its resources.

        Sends SIGTERM, waits up to 10 seconds, then kills the process if it
        is still alive. Safe to call repeatedly or when nothing is running.
        """
        proc = self.process
        if proc is not None:
            try:
                if proc.poll() is None:
                    proc.terminate()
                    try:
                        proc.wait(timeout=10)
                    except subprocess.TimeoutExpired:
                        proc.kill()
                        proc.wait(timeout=5)
            except (OSError, subprocess.TimeoutExpired):
                # Best-effort teardown: the process may already be gone, or
                # may not die even after kill(); neither should fail a test
                # run during cleanup.
                pass
            finally:
                self.process = None
        self._close_stderr_log()

    def __enter__(self) -> "TurboQuantServer":
        try:
            self.start()
        except BaseException:
            # __exit__ is not called when __enter__ raises, so make sure an
            # interrupted startup (e.g. Ctrl-C) does not leak the child.
            self.stop()
            raise
        return self

    def __exit__(self, *args):
        self.stop()


def find_server_binary() -> Optional[str]:
    """Find the llama-server binary in common locations.

    Checks the per-user and ``/opt`` TurboQuant build directories first,
    then falls back to a ``PATH`` lookup.

    Returns:
        The path of the first executable regular file found, or ``None``
        when llama-server is not installed in any searched location.
    """
    candidates = [
        Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
        Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
    ]
    for candidate in candidates:
        # A directory or a non-executable file with the right name cannot be
        # launched, so it must not shadow a usable binary further down.
        if candidate.is_file() and os.access(candidate, os.X_OK):
            return str(candidate)

    # shutil.which performs the PATH lookup in-process; it does not depend on
    # an external `which` command being installed.
    return shutil.which("llama-server")


def find_model(model_dir: Optional[str] = None) -> Optional[str]:
    """Locate a GGUF model file.

    Locations are tried in order: ``model_dir``, the ``TURBOQUANT_MODEL_DIR``
    environment variable, ``~/models``, ``/opt/models`` and ``/tmp/models``.
    A location that is itself a ``.gguf`` file is returned directly; a
    directory is searched recursively and its lowest-sorting ``*.gguf`` path
    is returned.

    Returns:
        The model path as a string, or ``None`` when no location yields one.
    """
    locations = (
        model_dir,
        os.environ.get("TURBOQUANT_MODEL_DIR"),
        str(Path.home() / "models"),
        "/opt/models",
        "/tmp/models",
    )

    # filter(None, ...) drops unset/empty entries.
    for location in filter(None, locations):
        root = Path(location)
        if root.is_file() and root.suffix == ".gguf":
            return str(root)
        if not root.is_dir():
            continue
        # The minimum of the matches is the first element of their sorted order.
        first_match = min(root.rglob("*.gguf"), default=None)
        if first_match is not None:
            return str(first_match)

    return None