Compare commits

...

2 Commits

Author SHA1 Message Date
9ed8cd3cae feat: add auto-start server fixture (#118)
All checks were successful
Smoke Test / smoke (pull_request) Successful in 21s
- turboquant_server_url fixture: auto-starts llama-server if no URL provided
- Finds binary in standard locations or PATH
- Finds GGUF model in standard locations
- Configurable via env vars (port, kv_type, ctx_size, timeout)
- Skips gracefully if binary or model not found
- turboquant_model_name fixture for model discovery
2026-04-21 11:52:26 +00:00
82ab8b22c3 feat: add server manager for auto-start fixture (#118) 2026-04-21 11:51:22 +00:00
2 changed files with 280 additions and 1 deletions

View File

@@ -1,3 +1,85 @@
"""Pytest configuration for turboquant."""
import sys, os
import os
import sys
import pytest
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
@pytest.fixture(scope="session")
def turboquant_server_url():
    """Provide the base URL of a TurboQuant server for the test session.

    If the TURBOQUANT_SERVER_URL environment variable is set (and non-empty),
    that URL is yielded as-is and no process is started. Otherwise the
    fixture locates a llama-server binary and a GGUF model through
    ``server_manager``, starts the server, yields its URL and stops the
    server at session teardown.

    Environment variables read when auto-starting:
        TURBOQUANT_TEST_PORT        port to listen on (default "18081")
        TURBOQUANT_KV_TYPE          KV cache type (default "turbo4")
        TURBOQUANT_CTX_SIZE         context size (default "8192")
        TURBOQUANT_STARTUP_TIMEOUT  seconds to wait for startup (default "60")

    The dependent tests are skipped when ``server_manager`` cannot be
    imported, when no binary or model is found, or when the server fails to
    start. A non-numeric port/context/timeout value raises ``ValueError``.
    """
    # An externally managed server takes precedence over auto-start.
    preset_url = os.environ.get("TURBOQUANT_SERVER_URL")
    if preset_url:
        yield preset_url
        return

    # pytest.skip() raises, so nothing after a skip call in a branch runs.
    try:
        from server_manager import TurboQuantServer, find_server_binary, find_model
    except ImportError:
        pytest.skip("server_manager not available")

    binary = find_server_binary()
    if not binary:
        pytest.skip("llama-server binary not found — install llama-cpp-turboquant")

    model = find_model()
    if not model:
        pytest.skip("No GGUF model found — set TURBOQUANT_MODEL_DIR or place model in ~/models")

    port = int(os.environ.get("TURBOQUANT_TEST_PORT", "18081"))
    kv_type = os.environ.get("TURBOQUANT_KV_TYPE", "turbo4")
    ctx_size = int(os.environ.get("TURBOQUANT_CTX_SIZE", "8192"))
    timeout = float(os.environ.get("TURBOQUANT_STARTUP_TIMEOUT", "60"))

    server = TurboQuantServer(
        model_path=model,
        port=port,
        kv_type=kv_type,
        context_size=ctx_size,
        server_binary=binary,
        timeout=timeout,
    )

    # Only a startup failure is converted into a skip; the yield is kept
    # outside this handler so unrelated errors are never reported as skips.
    try:
        url = server.start()
    except Exception as e:
        server.stop()
        pytest.skip(f"Could not start TurboQuant server: {e}")

    try:
        yield url
    finally:
        server.stop()
@pytest.fixture(scope="session")
def turboquant_model_name(turboquant_server_url):
    """Return the id of the first model reported by the server.

    Queries ``{turboquant_server_url}/v1/models`` and returns the ``id`` of
    the first entry in the response's ``data`` list ("unknown" if that entry
    has no ``id``). If the request fails, the response cannot be parsed, or
    the list is empty, the fallback name "gemma-4" is returned instead.
    """
    import json
    import urllib.request

    try:
        req = urllib.request.Request(f"{turboquant_server_url}/v1/models")
        # Context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        models = data.get("data", [])
        if models:
            return models[0].get("id", "unknown")
    except Exception:
        # Deliberately best-effort: any network or parsing problem falls
        # through to the default name below rather than failing the session.
        pass
    return "gemma-4"

197
tests/server_manager.py Normal file
View File

@@ -0,0 +1,197 @@
#!/usr/bin/env python3
"""
TurboQuant Server Manager
Manages llama-server lifecycle for integration tests:
- Start server with TurboQuant flags
- Wait for health check
- Stop server on teardown
Usage:
from tests.server_manager import TurboQuantServer
with TurboQuantServer(model_path="/path/to/model.gguf") as server:
url = server.url # e.g. http://localhost:8081
# Run tests against server
"""
import json
import os
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Optional
class TurboQuantServer:
    """Context manager that runs a llama-server process with TurboQuant flags.

    Example:
        with TurboQuantServer(model_path="/path/to/model.gguf") as server:
            url = server.url

    Args:
        model_path: Path to the model file passed to the server with ``-m``.
        port: TCP port passed with ``--port``.
        kv_type: Value passed to both ``-ctk`` and ``-ctv``.
        context_size: Value passed with ``-c``.
        server_binary: Path to the llama-server executable. When omitted,
            two standard install locations and then PATH are searched.
        timeout: Seconds ``start()`` waits for the server to become healthy.
        host: Address passed with ``--host`` and used to build ``url``.
    """

    # Seconds to sleep between health probes while waiting for startup.
    _POLL_INTERVAL = 1.0
    # Maximum number of stderr characters quoted in an early-exit error.
    _STDERR_TAIL = 500

    def __init__(
        self,
        model_path: str,
        port: int = 8081,
        kv_type: str = "turbo4",
        context_size: int = 32768,
        server_binary: Optional[str] = None,
        timeout: float = 60.0,
        host: str = "127.0.0.1",
    ):
        self.model_path = model_path
        self.port = port
        self.kv_type = kv_type
        self.context_size = context_size
        self.timeout = timeout
        self.host = host
        # An explicit binary wins; otherwise fall back to auto-detection.
        # The result may be None, in which case start() raises.
        self.server_binary = server_binary or self._locate_binary()
        self.process: Optional[subprocess.Popen] = None
        # Temporary file receiving the server's stderr while it runs.
        self._stderr_log = None

    @staticmethod
    def _locate_binary() -> Optional[str]:
        """Return the path of an executable llama-server, or None."""
        candidates = (
            Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
            Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
        )
        for candidate in candidates:
            if candidate.is_file() and os.access(candidate, os.X_OK):
                return str(candidate)
        # Last resort: look the name up on PATH without spawning a process.
        return shutil.which("llama-server")

    @property
    def url(self) -> str:
        """Base URL of the server, built from host and port."""
        return f"http://{self.host}:{self.port}"

    def _build_command(self) -> list:
        """Return the argv list used to launch the server."""
        return [
            self.server_binary,
            "-m", self.model_path,
            "--port", str(self.port),
            "--host", self.host,
            "-ctk", self.kv_type,
            "-ctv", self.kv_type,
            "-c", str(self.context_size),
        ]

    def _check_health(self) -> bool:
        """Return True if ``/v1/models`` answers with a non-empty ``data`` list."""
        try:
            req = urllib.request.Request(f"{self.url}/v1/models")
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read())
            return "data" in data and len(data.get("data", [])) > 0
        except Exception:
            # Any failure (refused connection, bad JSON, unexpected shape)
            # simply means "not ready yet".
            return False

    def _read_stderr_tail(self) -> str:
        """Return the last ``_STDERR_TAIL`` characters the server wrote to stderr."""
        log = self._stderr_log
        if log is None:
            return ""
        try:
            log.seek(0)
            text = log.read().decode(errors="replace")
        except (OSError, ValueError):
            return ""
        return text[-self._STDERR_TAIL:]

    def _close_stderr_log(self) -> None:
        """Close and forget the stderr capture file, if any."""
        log, self._stderr_log = self._stderr_log, None
        if log is not None:
            log.close()

    def start(self) -> str:
        """Start the server and wait until it is healthy.

        Returns:
            The server URL. If this instance already has a running process,
            its URL is returned without starting another one.

        Raises:
            RuntimeError: No server binary is known, or the process exited
                before becoming healthy (the message quotes the end of its
                stderr output).
            FileNotFoundError: ``model_path`` does not exist.
            TimeoutError: The server was not healthy within ``timeout``
                seconds; the process is stopped before raising.
        """
        if self.process is not None:
            if self.process.poll() is None:
                return self.url
            # A previous process has already exited: release its resources.
            self.stop()

        if not self.server_binary:
            raise RuntimeError(
                "llama-server binary not found. Set server_binary or install to standard location."
            )
        if not Path(self.model_path).exists():
            raise FileNotFoundError(f"Model not found: {self.model_path}")

        cmd = self._build_command()
        env = os.environ.copy()
        env["TURBO_LAYER_ADAPTIVE"] = "7"

        # Output must not go to an undrained PIPE: once the OS pipe buffer
        # fills the child blocks on write. stdout is discarded and stderr is
        # captured in a temporary file so it can be quoted on failure.
        self._stderr_log = tempfile.TemporaryFile()
        try:
            self.process = subprocess.Popen(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=self._stderr_log,
                env=env,
            )
        except Exception:
            self._close_stderr_log()
            raise

        # Monotonic clock so wall-clock adjustments cannot distort the wait.
        deadline = time.monotonic() + self.timeout
        while time.monotonic() < deadline:
            returncode = self.process.poll()
            if returncode is not None:
                stderr = self._read_stderr_tail()
                self.stop()
                raise RuntimeError(f"Server exited early (code {returncode}): {stderr}")
            if self._check_health():
                return self.url
            time.sleep(self._POLL_INTERVAL)

        self.stop()
        raise TimeoutError(f"Server did not become healthy within {self.timeout}s")

    def stop(self):
        """Stop the server if it is running; safe to call repeatedly.

        Sends a terminate request, waits up to 10 seconds, then kills the
        process and waits up to 5 more seconds. Failures while stopping are
        ignored; ``self.process`` is always reset to None.
        """
        proc, self.process = self.process, None
        try:
            if proc is not None and proc.poll() is None:
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except subprocess.TimeoutExpired:
                    proc.kill()
                    try:
                        proc.wait(timeout=5)
                    except subprocess.TimeoutExpired:
                        # Still not reaped after a kill; nothing more to do.
                        pass
        except OSError:
            # The process disappeared between the poll and the signal.
            pass
        finally:
            self._close_stderr_log()

    def __enter__(self) -> "TurboQuantServer":
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()
def find_server_binary() -> Optional[str]:
    """Find the llama-server binary.

    Checks ``~/llama-cpp-turboquant/build/bin/llama-server`` and
    ``/opt/llama-cpp-turboquant/build/bin/llama-server`` in that order, then
    falls back to a PATH lookup.

    Returns:
        The path of the binary as a string, or None if none was found.
    """
    candidates = (
        Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
        Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
    )
    for candidate in candidates:
        # is_file() rather than exists(): a directory is not a usable binary.
        if candidate.is_file():
            return str(candidate)
    # shutil.which performs the PATH lookup in-process, so it does not
    # depend on an external `which` command being installed.
    return shutil.which("llama-server")
def find_model(model_dir: Optional[str] = None) -> Optional[str]:
    """Find a GGUF model file.

    Locations are tried in order: ``model_dir``, the TURBOQUANT_MODEL_DIR
    environment variable, ``~/models``, ``/opt/models`` and ``/tmp/models``.
    A location that is itself a ``.gguf`` file is returned directly. A
    directory is searched recursively and the first ``.gguf`` file in sorted
    path order is returned.

    Args:
        model_dir: Optional directory (or ``.gguf`` file) to try first.

    Returns:
        The path of the model file as a string, or None if none was found.
    """
    search_dirs = [
        model_dir,
        os.environ.get("TURBOQUANT_MODEL_DIR"),
        str(Path.home() / "models"),
        "/opt/models",
        "/tmp/models",
    ]
    for location in search_dirs:
        if not location:
            continue
        root = Path(location)
        if root.is_file():
            if root.suffix == ".gguf":
                return str(root)
            continue
        if root.is_dir():
            # Keep regular files only: rglob also matches directories whose
            # name ends in ".gguf", and those are not loadable models.
            first = min(
                (f for f in root.rglob("*.gguf") if f.is_file()),
                default=None,
            )
            if first is not None:
                return str(first)
    return None