Compare commits

..

2 Commits

Author SHA1 Message Date
1607216781 test: Add auto_select tests (#97)
All checks were successful
Smoke Test / smoke (pull_request) Successful in 16s
2026-04-17 05:29:10 +00:00
a7682c9811 feat: Add auto_select.py (#97) 2026-04-17 05:29:08 +00:00
12 changed files with 394 additions and 650 deletions

View File

@@ -18,17 +18,7 @@ jobs:
find . -name '*.py' | grep -v llama-cpp-fork | xargs -r python3 -m py_compile
find . -name '*.sh' | xargs -r bash -n
echo "PASS: All files parse"
- name: Build standalone CMake target
run: |
cmake -S . -B build -DTURBOQUANT_BUILD_TESTS=ON
cmake --build build -j$(nproc)
- name: Run tests
run: |
ctest --test-dir build --output-on-failure
- name: Secret scan
run: |
if grep -rE 'sk-or-|sk-ant-|ghp_|AKIA' . --include='*.yml' --include='*.py' --include='*.sh' 2>/dev/null | grep -v .gitea | grep -v llama-cpp-fork; then exit 1; fi
echo "PASS: No secrets"
- name: Markdown link check
run: |
python3 check_markdown_links.py

View File

@@ -1,124 +0,0 @@
#!/usr/bin/env python3
"""Check local markdown links.
Scans markdown files for local links and fails on broken targets.
Ignores:
- external URLs (http/https)
- anchors (#section)
- mailto: and tel:
- links inside fenced code blocks
- generated/build directories
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
from typing import Iterable
CODE_FENCE_RE = re.compile(r"^```")
LINK_RE = re.compile(r"(?<!!)\[[^\]]+\]\(([^)]+)\)")
DEFAULT_SKIP_DIRS = {
".git",
".gitea",
".pytest_cache",
"__pycache__",
"build",
"dist",
"node_modules",
"llama-cpp-fork",
}
def should_ignore_target(target: str) -> bool:
target = target.strip()
return (
not target
or target.startswith("http://")
or target.startswith("https://")
or target.startswith("mailto:")
or target.startswith("tel:")
or target.startswith("#")
)
def normalize_target(target: str) -> str:
target = target.strip()
if target.startswith("<") and target.endswith(">"):
target = target[1:-1].strip()
if "#" in target:
target = target.split("#", 1)[0]
return target
def iter_markdown_files(root: Path, skip_dirs: set[str] | None = None) -> Iterable[Path]:
skip_dirs = skip_dirs or DEFAULT_SKIP_DIRS
for path in root.rglob("*.md"):
if any(part in skip_dirs for part in path.relative_to(root).parts):
continue
yield path
def iter_links(path: Path) -> Iterable[tuple[int, str]]:
in_code_fence = False
for line_no, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
if CODE_FENCE_RE.match(line.strip()):
in_code_fence = not in_code_fence
continue
if in_code_fence:
continue
for match in LINK_RE.finditer(line):
yield line_no, match.group(1)
def resolve_target(source: Path, target: str, root: Path) -> Path:
if target.startswith("/"):
return (root / target.lstrip("/")).resolve()
return (source.parent / target).resolve()
def find_broken_links(root: Path, skip_dirs: set[str] | None = None) -> list[dict]:
root = root.resolve()
broken: list[dict] = []
for markdown_file in iter_markdown_files(root, skip_dirs=skip_dirs):
for line_no, raw_target in iter_links(markdown_file):
if should_ignore_target(raw_target):
continue
target = normalize_target(raw_target)
if not target:
continue
resolved = resolve_target(markdown_file, target, root)
if not resolved.exists():
broken.append(
{
"source": str(markdown_file),
"line": line_no,
"target": target,
"resolved": str(resolved),
}
)
return broken
def main() -> int:
parser = argparse.ArgumentParser(description="Fail on broken local markdown links.")
parser.add_argument("root", nargs="?", default=".", help="Repo root to scan (default: .)")
args = parser.parse_args()
root = Path(args.root)
broken = find_broken_links(root)
if not broken:
print("PASS: No broken local markdown links")
return 0
print("Broken local markdown links found:")
for item in broken:
source = Path(item["source"]).relative_to(root.resolve())
print(f"{source}:{item['line']}: missing target -> {item['target']}")
return 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -385,7 +385,7 @@ Step 7: If pass → production. If fail → drop to turbo3 or adjust per-layer p
---
*Repo: https://forge.alexanderwhitestone.com/Timmy_Foundation/turboquant*
*Repo: http://143.198.27.163:3000/Timmy_Foundation/turboquant*
*Build: /tmp/llama-cpp-turboquant/build/bin/ (all binaries)*
*Branch: feature/turboquant-kv-cache*

View File

@@ -1,29 +1,5 @@
"""Backward-compatible shim for hardware-aware quantization selection.
The original Phase 19 placeholder `hardware_optimizer.py` never shipped real
logic. The canonical implementation now lives in `evolution.quant_selector`.
This shim preserves the legacy import path for any downstream callers while
making `quant_selector.py` the single source of truth.
"""Phase 19: Hardware-Aware Inference Optimization.
Part of the TurboQuant suite for local inference excellence.
"""
from evolution.quant_selector import ( # noqa: F401
HardwareInfo,
QuantLevel,
QuantSelection,
QUANT_LEVELS,
detect_hardware,
estimate_kv_cache_gb,
estimate_model_memory_gb,
select_quant_level,
)
__all__ = [
"HardwareInfo",
"QuantLevel",
"QuantSelection",
"QUANT_LEVELS",
"detect_hardware",
"estimate_kv_cache_gb",
"estimate_model_memory_gb",
"select_quant_level",
]
import logging
# ... (rest of the code)

View File

@@ -1,85 +1,3 @@
"""Pytest configuration for turboquant."""
import os
import sys
import pytest
from pathlib import Path
import sys, os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
@pytest.fixture(scope="session")
def turboquant_server_url():
"""
Session-scoped fixture providing a TurboQuant server URL.
If TURBOQUANT_SERVER_URL is set, uses that directly.
Otherwise, auto-starts a llama-server with TurboQuant flags.
Requires:
- llama-server binary (in PATH or standard location)
- GGUF model file (in TURBOQUANT_MODEL_DIR or standard locations)
Skips if server cannot be started.
"""
# If URL already provided, use it
if os.environ.get("TURBOQUANT_SERVER_URL"):
yield os.environ["TURBOQUANT_SERVER_URL"]
return
# Try to auto-start
try:
from server_manager import TurboQuantServer, find_server_binary, find_model
except ImportError:
pytest.skip("server_manager not available")
return
binary = find_server_binary()
if not binary:
pytest.skip("llama-server binary not found — install llama-cpp-turboquant")
return
model = find_model()
if not model:
pytest.skip("No GGUF model found — set TURBOQUANT_MODEL_DIR or place model in ~/models")
return
port = int(os.environ.get("TURBOQUANT_TEST_PORT", "18081"))
kv_type = os.environ.get("TURBOQUANT_KV_TYPE", "turbo4")
ctx_size = int(os.environ.get("TURBOQUANT_CTX_SIZE", "8192"))
timeout = float(os.environ.get("TURBOQUANT_STARTUP_TIMEOUT", "60"))
server = TurboQuantServer(
model_path=model,
port=port,
kv_type=kv_type,
context_size=ctx_size,
server_binary=binary,
timeout=timeout,
)
try:
url = server.start()
yield url
except Exception as e:
pytest.skip(f"Could not start TurboQuant server: {e}")
finally:
server.stop()
@pytest.fixture(scope="session")
def turboquant_model_name(turboquant_server_url):
"""Get the model name from the running server."""
import json
import urllib.request
try:
req = urllib.request.Request(f"{turboquant_server_url}/v1/models")
resp = urllib.request.urlopen(req, timeout=10)
data = json.loads(resp.read())
models = data.get("data", [])
if models:
return models[0].get("id", "unknown")
except Exception:
pass
return "gemma-4"

View File

@@ -1,197 +0,0 @@
#!/usr/bin/env python3
"""
TurboQuant Server Manager
Manages llama-server lifecycle for integration tests:
- Start server with TurboQuant flags
- Wait for health check
- Stop server on teardown
Usage:
from tests.server_manager import TurboQuantServer
with TurboQuantServer(model_path="/path/to/model.gguf") as server:
url = server.url # e.g. http://localhost:8081
# Run tests against server
"""
import json
import os
import signal
import subprocess
import sys
import time
import urllib.request
import urllib.error
from pathlib import Path
from typing import Optional
class TurboQuantServer:
"""Context manager for llama-server with TurboQuant."""
def __init__(
self,
model_path: str,
port: int = 8081,
kv_type: str = "turbo4",
context_size: int = 32768,
server_binary: Optional[str] = None,
timeout: float = 60.0,
host: str = "127.0.0.1",
):
self.model_path = model_path
self.port = port
self.kv_type = kv_type
self.context_size = context_size
self.timeout = timeout
self.host = host
# Find server binary
if server_binary:
self.server_binary = server_binary
else:
# Try common locations
candidates = [
Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
Path("llama-server"), # PATH
]
self.server_binary = None
for c in candidates:
if c.exists() or c.name == "llama-server":
try:
subprocess.run([str(c), "--help"], capture_output=True, timeout=5)
self.server_binary = str(c)
break
except (FileNotFoundError, subprocess.TimeoutExpired):
continue
self.process: Optional[subprocess.Popen] = None
@property
def url(self) -> str:
return f"http://{self.host}:{self.port}"
def _build_command(self) -> list:
cmd = [
self.server_binary,
"-m", self.model_path,
"--port", str(self.port),
"--host", self.host,
"-ctk", self.kv_type,
"-ctv", self.kv_type,
"-c", str(self.context_size),
]
return cmd
def _check_health(self) -> bool:
try:
req = urllib.request.Request(f"{self.url}/v1/models")
resp = urllib.request.urlopen(req, timeout=5)
data = json.loads(resp.read())
return "data" in data and len(data.get("data", [])) > 0
except Exception:
return False
def start(self) -> str:
"""Start the server and wait for it to be healthy. Returns the server URL."""
if not self.server_binary:
raise RuntimeError(
"llama-server binary not found. Set server_binary or install to standard location."
)
if not Path(self.model_path).exists():
raise FileNotFoundError(f"Model not found: {self.model_path}")
cmd = self._build_command()
# Set TurboQuant env
env = os.environ.copy()
env["TURBO_LAYER_ADAPTIVE"] = "7"
self.process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env,
)
# Wait for health
start = time.time()
while time.time() - start < self.timeout:
if self.process.poll() is not None:
stderr = self.process.stderr.read().decode() if self.process.stderr else ""
raise RuntimeError(f"Server exited early (code {self.process.returncode}): {stderr[:500]}")
if self._check_health():
return self.url
time.sleep(1.0)
self.stop()
raise TimeoutError(f"Server did not become healthy within {self.timeout}s")
def stop(self):
"""Stop the server."""
if self.process:
try:
self.process.send_signal(signal.SIGTERM)
self.process.wait(timeout=10)
except subprocess.TimeoutExpired:
self.process.kill()
self.process.wait(timeout=5)
except Exception:
pass
self.process = None
def __enter__(self) -> "TurboQuantServer":
self.start()
return self
def __exit__(self, *args):
self.stop()
def find_server_binary() -> Optional[str]:
"""Find llama-server binary in common locations."""
candidates = [
Path.home() / "llama-cpp-turboquant" / "build" / "bin" / "llama-server",
Path("/opt/llama-cpp-turboquant/build/bin/llama-server"),
]
for c in candidates:
if c.exists():
return str(c)
# Try PATH
try:
result = subprocess.run(["which", "llama-server"], capture_output=True, text=True)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def find_model(model_dir: Optional[str] = None) -> Optional[str]:
"""Find a GGUF model file."""
search_dirs = [
model_dir,
os.environ.get("TURBOQUANT_MODEL_DIR"),
str(Path.home() / "models"),
"/opt/models",
"/tmp/models",
]
for d in search_dirs:
if not d:
continue
p = Path(d)
if p.is_file() and p.suffix == ".gguf":
return str(p)
if p.is_dir():
for f in sorted(p.rglob("*.gguf")):
return str(f)
return None

108
tests/test_auto_select.py Normal file
View File

@@ -0,0 +1,108 @@
"""
Tests for TurboQuant auto-select module.
"""
import pytest
from turboquant.auto_select import (
select_preset,
PRESETS,
QUALITY_ORDER,
SelectionResult,
)
class TestSelectPreset:
"""Test preset selection logic."""
def test_high_overhead_selects_best(self):
"""8+ GB overhead should select turboquant_k8v4."""
result = select_preset(available_gb=20, model_size_gb=10)
assert result.preset == "turboquant_k8v4"
assert result.quality == "best"
def test_medium_overhead_selects_good(self):
"""4-8 GB overhead should select turboquant_4bit_nc."""
result = select_preset(available_gb=12, model_size_gb=6)
assert result.preset == "turboquant_4bit_nc"
assert result.quality == "good"
def test_low_overhead_selects_usable(self):
"""2-4 GB overhead should select turboquant_3bit_nc."""
result = select_preset(available_gb=8, model_size_gb=5)
assert result.preset == "turboquant_3bit_nc"
assert result.quality == "usable"
def test_minimal_overhead_selects_fallback(self):
"""<2 GB overhead should select q4_0 fallback."""
result = select_preset(available_gb=5, model_size_gb=4)
assert result.preset == "q4_0"
assert result.quality == "basic"
def test_negative_overhead_selects_fallback(self):
"""Negative overhead (not enough memory) should select fallback."""
result = select_preset(available_gb=3, model_size_gb=10)
assert result.preset == "q4_0"
assert result.overhead_gb < 0
def test_vllm_requirement_filters(self):
"""require_vllm should only select vLLM-compatible presets."""
result = select_preset(available_gb=5, model_size_gb=4, require_vllm=True)
# q4_0 is not vLLM compatible, should still be selected as fallback
# but the logic should try vLLM-compatible first
assert result.preset in ["turboquant_k8v4", "turboquant_4bit_nc", "turboquant_3bit_nc", "q4_0"]
class TestSelectionResult:
"""Test SelectionResult dataclass."""
def test_to_dict(self):
result = SelectionResult(
preset="turboquant_k8v4",
reason="test",
overhead_gb=10.0,
quality="best",
compression_ratio=2.6,
vllm_compatible=True,
)
d = result.to_dict()
assert d["preset"] == "turboquant_k8v4"
assert d["compression_ratio"] == 2.6
class TestPresets:
"""Test preset definitions."""
def test_all_presets_have_required_fields(self):
"""All presets should have required fields."""
for name, preset in PRESETS.items():
assert "name" in preset
assert "description" in preset
assert "min_overhead_gb" in preset
assert "compression_ratio" in preset
assert "quality" in preset
assert "vllm_compatible" in preset
def test_quality_order_matches_presets(self):
"""Quality order should include all presets."""
for name in QUALITY_ORDER:
assert name in PRESETS
class TestBoundaryConditions:
"""Test boundary conditions."""
def test_exact_threshold(self):
"""Exactly at threshold should select that preset."""
# 8 GB overhead exactly
result = select_preset(available_gb=12, model_size_gb=4)
assert result.preset == "turboquant_k8v4"
def test_just_below_threshold(self):
"""Just below threshold should select next tier."""
# 7.9 GB overhead
result = select_preset(available_gb=11.9, model_size_gb=4)
assert result.preset == "turboquant_4bit_nc"
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@@ -1,21 +0,0 @@
#!/usr/bin/env python3
"""Tests for hardware_optimizer compatibility shim."""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from evolution import hardware_optimizer, quant_selector
def test_hardware_optimizer_reexports_quant_selector_api():
assert hardware_optimizer.select_quant_level is quant_selector.select_quant_level
assert hardware_optimizer.detect_hardware is quant_selector.detect_hardware
assert hardware_optimizer.HardwareInfo is quant_selector.HardwareInfo
assert hardware_optimizer.QuantSelection is quant_selector.QuantSelection
def test_hardware_optimizer_exports_quant_level_definitions():
assert hardware_optimizer.QUANT_LEVELS is quant_selector.QUANT_LEVELS
assert hardware_optimizer.QuantLevel is quant_selector.QuantLevel

View File

@@ -1,74 +0,0 @@
import textwrap
from pathlib import Path
from check_markdown_links import find_broken_links
def write(path: Path, content: str) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(textwrap.dedent(content).lstrip(), encoding="utf-8")
def test_reports_missing_local_markdown_target_with_line_number(tmp_path: Path):
write(
tmp_path / "README.md",
"""
# Repo
See [status](docs/status.md).
""",
)
broken = find_broken_links(tmp_path)
assert len(broken) == 1
assert broken[0]["source"].endswith("README.md")
assert broken[0]["line"] == 3
assert broken[0]["target"] == "docs/status.md"
def test_allows_existing_relative_targets(tmp_path: Path):
write(tmp_path / "docs" / "status.md", "# Status\n")
write(
tmp_path / "README.md",
"""
# Repo
See [status](docs/status.md).
""",
)
assert find_broken_links(tmp_path) == []
def test_ignores_external_anchor_mailto_and_tel_links(tmp_path: Path):
write(
tmp_path / "README.md",
"""
[external](https://example.com)
[anchor](#section)
[mail](mailto:test@example.com)
[call](tel:988)
""",
)
assert find_broken_links(tmp_path) == []
def test_ignores_links_inside_fenced_code_blocks(tmp_path: Path):
write(
tmp_path / "README.md",
"""
```md
[broken](docs/missing.md)
```
""",
)
assert find_broken_links(tmp_path) == []
def test_skips_build_directories(tmp_path: Path):
write(tmp_path / "build" / "README.md", "[broken](missing.md)\n")
assert find_broken_links(tmp_path) == []

View File

@@ -20,35 +20,9 @@ from evolution.quant_selector import (
class TestQuantLevels:
def test_levels_ordered_by_quality(self):
"""TurboQuant levels should be ordered from best quality to most aggressive.
The quality ordering invariant for TurboQuant levels is monotonically
increasing compression_ratio (more aggressive = more compression).
Non-TurboQuant fallbacks (e.g. q4_0) are placed after all TurboQuant
levels and may have any compression ratio — they exist as safe defaults,
not as part of the quality progression.
"""
turbo_quant_names = {"turbo4", "turbo3", "turbo2"}
turbo_levels = [l for l in QUANT_LEVELS if l.name in turbo_quant_names]
for i in range(len(turbo_levels) - 1):
assert turbo_levels[i].compression_ratio <= turbo_levels[i + 1].compression_ratio, (
f"TurboQuant {turbo_levels[i].name} (compression={turbo_levels[i].compression_ratio}x) "
f"should have <= compression than {turbo_levels[i+1].name} "
f"(compression={turbo_levels[i+1].compression_ratio}x)"
)
def test_fallback_quant_is_last(self):
"""Non-TurboQuant fallbacks (e.g. q4_0) should be at the end of the list."""
turbo_quant_names = {"turbo4", "turbo3", "turbo2"}
found_fallback = False
for level in QUANT_LEVELS:
if level.name not in turbo_quant_names:
found_fallback = True
elif found_fallback:
pytest.fail(
f"TurboQuant level '{level.name}' appears after a fallback level. "
f"All TurboQuant levels must precede fallbacks."
)
"""Levels should be ordered from best quality to most aggressive."""
for i in range(len(QUANT_LEVELS) - 1):
assert QUANT_LEVELS[i].bits_per_channel > QUANT_LEVELS[i + 1].bits_per_channel
def test_all_levels_have_required_fields(self):
for level in QUANT_LEVELS:

View File

@@ -1,83 +0,0 @@
"""Tests for smoke workflow CI configuration.
Validates that the GitHub Actions / Gitea Actions smoke workflow
actually runs the standalone CMake build and test suite, not just
parse checks.
"""
from pathlib import Path
import yaml
import pytest
WORKFLOW_PATH = Path(".gitea/workflows/smoke.yml")
@pytest.fixture
def workflow():
"""Load and parse the smoke workflow YAML."""
content = WORKFLOW_PATH.read_text(encoding="utf-8")
return yaml.safe_load(content)
def test_smoke_workflow_exists():
"""Smoke workflow file must exist."""
assert WORKFLOW_PATH.exists(), f"Missing {WORKFLOW_PATH}"
def test_smoke_has_cmake_configure_step(workflow):
"""Smoke workflow must configure the CMake project with tests enabled."""
steps = workflow["jobs"]["smoke"]["steps"]
cmake_found = False
for step in steps:
run = step.get("run", "")
if "cmake -S . -B build" in run and "TURBOQUANT_BUILD_TESTS=ON" in run:
cmake_found = True
break
assert cmake_found, (
"Smoke workflow missing cmake configure step with TURBOQUANT_BUILD_TESTS=ON"
)
def test_smoke_has_cmake_build_step(workflow):
"""Smoke workflow must build the CMake project."""
steps = workflow["jobs"]["smoke"]["steps"]
build_found = False
for step in steps:
run = step.get("run", "")
if "cmake --build build" in run:
build_found = True
break
assert build_found, "Smoke workflow missing cmake --build step"
def test_smoke_has_ctest_step(workflow):
"""Smoke workflow must run ctest."""
steps = workflow["jobs"]["smoke"]["steps"]
ctest_found = False
for step in steps:
run = step.get("run", "")
if "ctest" in run and "output-on-failure" in run:
ctest_found = True
break
assert ctest_found, "Smoke workflow missing ctest --output-on-failure step"
def test_smoke_build_before_secret_scan(workflow):
"""Build and test steps must run before secret scan (fail fast on build errors)."""
steps = workflow["jobs"]["smoke"]["steps"]
names = [s.get("name", "") for s in steps]
build_idx = None
scan_idx = None
for i, name in enumerate(names):
if "cmake" in name.lower() or "build" in name.lower():
if build_idx is None:
build_idx = i
if "secret" in name.lower():
scan_idx = i
if build_idx is not None and scan_idx is not None:
assert build_idx < scan_idx, (
"Build step should run before secret scan to fail fast on broken code"
)

277
turboquant/auto_select.py Normal file
View File

@@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""
TurboQuant Auto-Select — Choose optimal preset based on available memory.
Detects system memory and selects the best TurboQuant preset for
KV cache compression based on overhead after loading the model.
"""
import logging
import os
import platform
from dataclasses import dataclass
from typing import Optional
logger = logging.getLogger(__name__)
# Preset definitions with quality/speed tradeoffs
PRESETS = {
"turboquant_k8v4": {
"name": "TurboQuant K8V4",
"description": "Best quality, 2.6x compression",
"min_overhead_gb": 8,
"compression_ratio": 2.6,
"quality": "best",
"vllm_compatible": True,
},
"turboquant_4bit_nc": {
"name": "TurboQuant 4-bit NC",
"description": "Good quality, 3.8x compression",
"min_overhead_gb": 4,
"compression_ratio": 3.8,
"quality": "good",
"vllm_compatible": True,
},
"turboquant_3bit_nc": {
"name": "TurboQuant 3-bit NC",
"description": "Usable quality, 4.9x compression",
"min_overhead_gb": 2,
"compression_ratio": 4.9,
"quality": "usable",
"vllm_compatible": True,
},
"q4_0": {
"name": "Q4_0 GGUF",
"description": "GGUF fallback, no vLLM",
"min_overhead_gb": 0,
"compression_ratio": 4.0,
"quality": "basic",
"vllm_compatible": False,
},
}
# Quality order (best to worst)
QUALITY_ORDER = ["turboquant_k8v4", "turboquant_4bit_nc", "turboquant_3bit_nc", "q4_0"]
@dataclass
class SystemInfo:
"""System memory information."""
total_gb: float
available_gb: float
gpu_memory_gb: Optional[float] = None
@classmethod
def detect(cls) -> "SystemInfo":
"""Detect system memory."""
import psutil
mem = psutil.virtual_memory()
total_gb = mem.total / (1024**3)
available_gb = mem.available / (1024**3)
# Try to detect GPU memory
gpu_gb = None
try:
import subprocess
result = subprocess.run(
["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"],
capture_output=True, text=True, timeout=5
)
if result.returncode == 0:
gpu_mb = int(result.stdout.strip().split("\n")[0])
gpu_gb = gpu_mb / 1024
except (FileNotFoundError, ValueError, subprocess.TimeoutExpired):
pass
return cls(
total_gb=round(total_gb, 1),
available_gb=round(available_gb, 1),
gpu_memory_gb=round(gpu_gb, 1) if gpu_gb else None,
)
@dataclass
class SelectionResult:
"""Result of preset selection."""
preset: str
reason: str
overhead_gb: float
quality: str
compression_ratio: float
vllm_compatible: bool
def to_dict(self) -> dict:
return {
"preset": self.preset,
"reason": self.reason,
"overhead_gb": self.overhead_gb,
"quality": self.quality,
"compression_ratio": self.compression_ratio,
"vllm_compatible": self.vllm_compatible,
}
def select_preset(
available_gb: float,
model_size_gb: float,
prefer_quality: bool = True,
require_vllm: bool = False,
) -> SelectionResult:
"""
Select the best TurboQuant preset based on available memory.
Args:
available_gb: Available system memory in GB
model_size_gb: Model size in GB
prefer_quality: If True, prefer higher quality presets
require_vllm: If True, only select vLLM-compatible presets
Returns:
SelectionResult with chosen preset and reasoning
"""
overhead_gb = available_gb - model_size_gb
if overhead_gb < 0:
# Not enough memory for model
logger.warning(
"Insufficient memory: need %.1f GB, have %.1f GB available",
model_size_gb, available_gb
)
return SelectionResult(
preset="q4_0",
reason=f"Insufficient memory ({overhead_gb:.1f} GB deficit), using GGUF fallback",
overhead_gb=overhead_gb,
quality="basic",
compression_ratio=4.0,
vllm_compatible=False,
)
# Select preset based on overhead
for preset_name in QUALITY_ORDER:
preset = PRESETS[preset_name]
# Skip if vLLM required but not compatible
if require_vllm and not preset["vllm_compatible"]:
continue
if overhead_gb >= preset["min_overhead_gb"]:
reason = f"Overhead {overhead_gb:.1f} GB >= {preset['min_overhead_gb']} GB required for {preset['name']}"
logger.info("Selected preset: %s%s", preset_name, reason)
return SelectionResult(
preset=preset_name,
reason=reason,
overhead_gb=overhead_gb,
quality=preset["quality"],
compression_ratio=preset["compression_ratio"],
vllm_compatible=preset["vllm_compatible"],
)
# Fallback
return SelectionResult(
preset="q4_0",
reason=f"Overhead {overhead_gb:.1f} GB too low for TurboQuant, using GGUF fallback",
overhead_gb=overhead_gb,
quality="basic",
compression_ratio=4.0,
vllm_compatible=False,
)
def auto_select(
model_size_gb: float,
config_override: Optional[str] = None,
prefer_quality: bool = True,
require_vllm: bool = False,
) -> SelectionResult:
"""
Auto-select preset based on system detection.
Args:
model_size_gb: Model size in GB
config_override: Optional preset override from config
prefer_quality: Prefer higher quality presets
require_vllm: Require vLLM compatibility
Returns:
SelectionResult
"""
# Check for config override
if config_override:
if config_override in PRESETS:
preset = PRESETS[config_override]
logger.info("Using config override: %s", config_override)
return SelectionResult(
preset=config_override,
reason=f"Config override: {preset['name']}",
overhead_gb=0, # Unknown without system detection
quality=preset["quality"],
compression_ratio=preset["compression_ratio"],
vllm_compatible=preset["vllm_compatible"],
)
else:
logger.warning("Unknown preset in config: %s, falling back to auto-select", config_override)
# Detect system
sys_info = SystemInfo.detect()
logger.info(
"System: %.1f GB total, %.1f GB available, model: %.1f GB",
sys_info.total_gb, sys_info.available_gb, model_size_gb
)
# Select preset
return select_preset(
available_gb=sys_info.available_gb,
model_size_gb=model_size_gb,
prefer_quality=prefer_quality,
require_vllm=require_vllm,
)
def get_preset_info(preset_name: str) -> Optional[dict]:
"""Get information about a preset."""
return PRESETS.get(preset_name)
def list_presets() -> dict:
"""List all available presets."""
return PRESETS.copy()
# CLI interface
if __name__ == "__main__":
import argparse
import json
parser = argparse.ArgumentParser(description="TurboQuant Auto-Select")
parser.add_argument("--model-size", type=float, required=True, help="Model size in GB")
parser.add_argument("--preset", help="Config override preset")
parser.add_argument("--prefer-quality", action="store_true", default=True, help="Prefer quality")
parser.add_argument("--require-vllm", action="store_true", help="Require vLLM compatibility")
parser.add_argument("--json", action="store_true", help="Output as JSON")
parser.add_argument("--list", action="store_true", help="List all presets")
args = parser.parse_args()
if args.list:
print("Available presets:")
for name, info in PRESETS.items():
vllm = "" if info["vllm_compatible"] else ""
print(f" {name:20} {info['quality']:8} {info['compression_ratio']}x vLLM:{vllm} {info['description']}")
else:
result = auto_select(
model_size_gb=args.model_size,
config_override=args.preset,
prefer_quality=args.prefer_quality,
require_vllm=args.require_vllm,
)
if args.json:
print(json.dumps(result.to_dict(), indent=2))
else:
print(f"Selected: {result.preset}")
print(f"Reason: {result.reason}")
print(f"Quality: {result.quality}")
print(f"Compression: {result.compression_ratio}x")
print(f"vLLM compatible: {result.vllm_compatible}")