Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
a537511652 refactor: consolidate hardware optimizer with quant selector (#92)
All checks were successful
Smoke Test / smoke (pull_request) Successful in 17s
2026-04-20 20:38:56 -04:00
4 changed files with 49 additions and 389 deletions

View File

@@ -1,5 +1,29 @@
"""Phase 19: Hardware-Aware Inference Optimization.
Part of the TurboQuant suite for local inference excellence.
"""Backward-compatible shim for hardware-aware quantization selection.
The original Phase 19 placeholder `hardware_optimizer.py` never shipped real
logic. The canonical implementation now lives in `evolution.quant_selector`.
This shim preserves the legacy import path for any downstream callers while
making `quant_selector.py` the single source of truth.
"""
import logging
# ... (rest of the code)
from evolution.quant_selector import ( # noqa: F401
HardwareInfo,
QuantLevel,
QuantSelection,
QUANT_LEVELS,
detect_hardware,
estimate_kv_cache_gb,
estimate_model_memory_gb,
select_quant_level,
)
# Public surface of this shim: exactly the names imported from
# `evolution.quant_selector` above, so `from ... import *` on the legacy
# module path exposes the same objects.
__all__ = [
"HardwareInfo",
"QuantLevel",
"QuantSelection",
"QUANT_LEVELS",
"detect_hardware",
"estimate_kv_cache_gb",
"estimate_model_memory_gb",
"select_quant_level",
]

View File

@@ -1,108 +0,0 @@
"""
Tests for TurboQuant auto-select module.
"""
import pytest
from turboquant.auto_select import (
select_preset,
PRESETS,
QUALITY_ORDER,
SelectionResult,
)
class TestSelectPreset:
    """Exercise the overhead-to-preset mapping implemented by ``select_preset``.

    Overhead is ``available_gb - model_size_gb``; each test picks inputs that
    land in one overhead band and checks the preset chosen for that band.
    """

    def test_high_overhead_selects_best(self):
        """8+ GB overhead should select turboquant_k8v4."""
        result = select_preset(available_gb=20, model_size_gb=10)
        assert result.preset == "turboquant_k8v4"
        assert result.quality == "best"

    def test_medium_overhead_selects_good(self):
        """4-8 GB overhead should select turboquant_4bit_nc."""
        result = select_preset(available_gb=12, model_size_gb=6)
        assert result.preset == "turboquant_4bit_nc"
        assert result.quality == "good"

    def test_low_overhead_selects_usable(self):
        """2-4 GB overhead should select turboquant_3bit_nc."""
        result = select_preset(available_gb=8, model_size_gb=5)
        assert result.preset == "turboquant_3bit_nc"
        assert result.quality == "usable"

    def test_minimal_overhead_selects_fallback(self):
        """<2 GB overhead should select q4_0 fallback."""
        result = select_preset(available_gb=5, model_size_gb=4)
        assert result.preset == "q4_0"
        assert result.quality == "basic"

    def test_negative_overhead_selects_fallback(self):
        """Negative overhead (not enough memory) should select fallback."""
        result = select_preset(available_gb=3, model_size_gb=10)
        assert result.preset == "q4_0"
        assert result.overhead_gb < 0

    def test_vllm_requirement_filters(self):
        """require_vllm keeps vLLM presets when they fit, else falls back to q4_0."""
        # Plenty of overhead: the best preset is vLLM-compatible, so the
        # requirement changes nothing.
        roomy = select_preset(available_gb=20, model_size_gb=10, require_vllm=True)
        assert roomy.preset == "turboquant_k8v4"
        assert roomy.vllm_compatible is True

        # 1 GB overhead is below every vLLM-compatible threshold and q4_0 is
        # skipped inside the selection loop, so the final fallback is returned.
        # The previous assertion accepted any of the four presets and could
        # never fail; pin the actual outcome instead.
        tight = select_preset(available_gb=5, model_size_gb=4, require_vllm=True)
        assert tight.preset == "q4_0"
        assert tight.vllm_compatible is False
class TestSelectionResult:
    """Checks on the SelectionResult dataclass."""

    def test_to_dict(self):
        """to_dict() should echo back the values the instance was built with."""
        field_values = dict(
            preset="turboquant_k8v4",
            reason="test",
            overhead_gb=10.0,
            quality="best",
            compression_ratio=2.6,
            vllm_compatible=True,
        )
        as_dict = SelectionResult(**field_values).to_dict()
        assert as_dict["preset"] == "turboquant_k8v4"
        assert as_dict["compression_ratio"] == 2.6
class TestPresets:
    """Validate the static PRESETS / QUALITY_ORDER tables."""

    def test_all_presets_have_required_fields(self):
        """All presets should have required fields."""
        required_fields = (
            "name",
            "description",
            "min_overhead_gb",
            "compression_ratio",
            "quality",
            "vllm_compatible",
        )
        # Only the values are inspected, so iterate them directly.
        for preset in PRESETS.values():
            for field in required_fields:
                assert field in preset

    def test_quality_order_matches_presets(self):
        """Quality order should list every preset exactly once."""
        # A one-directional check (order -> presets) would miss a preset that
        # was added to PRESETS but never ranked; compare both ways.
        assert len(QUALITY_ORDER) == len(set(QUALITY_ORDER))
        assert set(QUALITY_ORDER) == set(PRESETS)
class TestBoundaryConditions:
    """Behaviour right at, and just under, the 8 GB overhead threshold."""

    def test_exact_threshold(self):
        """An overhead equal to the threshold still qualifies for that preset."""
        # 12 - 4 = 8 GB overhead exactly
        chosen = select_preset(available_gb=12, model_size_gb=4).preset
        assert chosen == "turboquant_k8v4"

    def test_just_below_threshold(self):
        """An overhead slightly under the threshold drops to the next tier."""
        # 11.9 - 4 = 7.9 GB overhead
        chosen = select_preset(available_gb=11.9, model_size_gb=4).preset
        assert chosen == "turboquant_4bit_nc"
# Allow running this test file directly; it hands itself to pytest verbosely.
if __name__ == "__main__":
    pytest_args = [__file__, "-v"]
    pytest.main(pytest_args)

View File

@@ -0,0 +1,21 @@
#!/usr/bin/env python3
"""Tests for hardware_optimizer compatibility shim."""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from evolution import hardware_optimizer, quant_selector
def test_hardware_optimizer_reexports_quant_selector_api():
    """Every function/class the shim re-exports is the quant_selector object itself.

    Covers the two memory-estimation helpers as well, which the shim imports
    and lists in ``__all__`` but which were previously left unchecked.
    """
    reexported_names = (
        "select_quant_level",
        "detect_hardware",
        "estimate_kv_cache_gb",
        "estimate_model_memory_gb",
        "HardwareInfo",
        "QuantSelection",
    )
    for name in reexported_names:
        # Identity, not equality: the shim must not wrap or copy anything.
        assert getattr(hardware_optimizer, name) is getattr(quant_selector, name), name
def test_hardware_optimizer_exports_quant_level_definitions():
    """The quant-level table and its type are shared objects, not copies."""
    for attr in ("QUANT_LEVELS", "QuantLevel"):
        assert getattr(hardware_optimizer, attr) is getattr(quant_selector, attr)

View File

@@ -1,277 +0,0 @@
#!/usr/bin/env python3
"""
TurboQuant Auto-Select — Choose optimal preset based on available memory.
Detects system memory and selects the best TurboQuant preset for
KV cache compression based on overhead after loading the model.
"""
import logging
import os
import platform
from dataclasses import dataclass
from typing import Optional
logger = logging.getLogger(__name__)
# Preset definitions with quality/speed tradeoffs.
# Each entry is keyed by the preset identifier and carries:
#   name, description  - human-readable labels
#   min_overhead_gb    - smallest (available_gb - model_size_gb) at which
#                        select_preset() will choose the entry
#   compression_ratio  - copied into SelectionResult.compression_ratio
#   quality            - label copied into SelectionResult.quality
#   vllm_compatible    - entries with False are skipped by select_preset()
#                        when require_vllm=True
PRESETS = {
"turboquant_k8v4": {
"name": "TurboQuant K8V4",
"description": "Best quality, 2.6x compression",
"min_overhead_gb": 8,
"compression_ratio": 2.6,
"quality": "best",
"vllm_compatible": True,
},
"turboquant_4bit_nc": {
"name": "TurboQuant 4-bit NC",
"description": "Good quality, 3.8x compression",
"min_overhead_gb": 4,
"compression_ratio": 3.8,
"quality": "good",
"vllm_compatible": True,
},
"turboquant_3bit_nc": {
"name": "TurboQuant 3-bit NC",
"description": "Usable quality, 4.9x compression",
"min_overhead_gb": 2,
"compression_ratio": 4.9,
"quality": "usable",
"vllm_compatible": True,
},
"q4_0": {
"name": "Q4_0 GGUF",
"description": "GGUF fallback, no vLLM",
"min_overhead_gb": 0,
"compression_ratio": 4.0,
"quality": "basic",
"vllm_compatible": False,
},
}
# Quality order (best to worst). select_preset() walks this list front to
# back and returns the first preset whose min_overhead_gb is satisfied.
QUALITY_ORDER = ["turboquant_k8v4", "turboquant_4bit_nc", "turboquant_3bit_nc", "q4_0"]
@dataclass
class SystemInfo:
    """System memory information."""

    total_gb: float  # total RAM in GiB
    available_gb: float  # currently available RAM in GiB
    gpu_memory_gb: Optional[float] = None  # first GPU's total memory, if detected

    @classmethod
    def detect(cls) -> "SystemInfo":
        """Detect system memory and, on a best-effort basis, GPU memory.

        RAM figures come from ``psutil.virtual_memory()``. GPU memory is read
        from ``nvidia-smi`` (first GPU only); any failure to run or parse it
        leaves ``gpu_memory_gb`` as ``None`` rather than raising.

        Returns:
            A SystemInfo with all values rounded to one decimal place.

        Raises:
            ImportError: if ``psutil`` is not installed.
        """
        import subprocess

        import psutil

        mem = psutil.virtual_memory()
        total_gb = mem.total / (1024**3)
        available_gb = mem.available / (1024**3)

        # Try to detect GPU memory
        gpu_gb = None
        try:
            result = subprocess.run(
                ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5
            )
            if result.returncode == 0:
                # One line per GPU, value in MiB; only the first GPU is used.
                gpu_mb = int(result.stdout.strip().split("\n")[0])
                gpu_gb = gpu_mb / 1024
        except (OSError, ValueError, subprocess.TimeoutExpired):
            # OSError covers a missing binary (FileNotFoundError) as well as
            # a non-executable one (PermissionError); the probe is optional,
            # so none of these should abort detection.
            pass

        return cls(
            total_gb=round(total_gb, 1),
            available_gb=round(available_gb, 1),
            gpu_memory_gb=round(gpu_gb, 1) if gpu_gb else None,
        )
@dataclass
class SelectionResult:
    """Outcome of a preset selection together with the figures behind it."""

    preset: str  # preset identifier
    reason: str  # human-readable explanation of the choice
    overhead_gb: float  # overhead figure recorded by the selector
    quality: str  # quality label of the chosen preset
    compression_ratio: float  # compression ratio of the chosen preset
    vllm_compatible: bool  # whether the chosen preset is flagged vLLM-compatible

    def to_dict(self) -> dict:
        """Return the six fields as a plain dict keyed by field name."""
        field_names = (
            "preset",
            "reason",
            "overhead_gb",
            "quality",
            "compression_ratio",
            "vllm_compatible",
        )
        return {field_name: getattr(self, field_name) for field_name in field_names}
def select_preset(
    available_gb: float,
    model_size_gb: float,
    prefer_quality: bool = True,
    require_vllm: bool = False,
) -> SelectionResult:
    """
    Select the best TurboQuant preset based on available memory.

    The overhead ``available_gb - model_size_gb`` is compared against each
    preset's ``min_overhead_gb`` in ``QUALITY_ORDER`` (best first); the first
    preset whose threshold is met wins.

    Args:
        available_gb: Available system memory in GB
        model_size_gb: Model size in GB
        prefer_quality: Accepted for API compatibility; the current
            implementation does not consult it (selection is always
            best-quality-first).
        require_vllm: If True, non-vLLM presets are skipped during selection.
            Note that when no vLLM-compatible preset fits, the ``q4_0``
            fallback is still returned with ``vllm_compatible=False``.

    Returns:
        SelectionResult with chosen preset and reasoning
    """
    overhead_gb = available_gb - model_size_gb

    def _gguf_fallback(reason: str) -> SelectionResult:
        """Build the q4_0 fallback result shared by both fallback paths."""
        return SelectionResult(
            preset="q4_0",
            reason=reason,
            overhead_gb=overhead_gb,
            quality="basic",
            compression_ratio=4.0,
            vllm_compatible=False,
        )

    if overhead_gb < 0:
        # Not enough memory for model
        logger.warning(
            "Insufficient memory: need %.1f GB, have %.1f GB available",
            model_size_gb, available_gb
        )
        # Report the shortfall as a positive number; "-7.0 GB deficit" reads
        # as a double negative.
        return _gguf_fallback(
            f"Insufficient memory ({-overhead_gb:.1f} GB deficit), using GGUF fallback"
        )

    # Select preset based on overhead
    for preset_name in QUALITY_ORDER:
        preset = PRESETS[preset_name]

        # Skip if vLLM required but not compatible
        if require_vllm and not preset["vllm_compatible"]:
            continue

        if overhead_gb >= preset["min_overhead_gb"]:
            reason = f"Overhead {overhead_gb:.1f} GB >= {preset['min_overhead_gb']} GB required for {preset['name']}"
            # Separator added: the name and reason were previously emitted
            # back-to-back with nothing between them.
            logger.info("Selected preset: %s - %s", preset_name, reason)
            return SelectionResult(
                preset=preset_name,
                reason=reason,
                overhead_gb=overhead_gb,
                quality=preset["quality"],
                compression_ratio=preset["compression_ratio"],
                vllm_compatible=preset["vllm_compatible"],
            )

    # Fallback: reached only when every candidate was skipped or too large.
    return _gguf_fallback(
        f"Overhead {overhead_gb:.1f} GB too low for TurboQuant, using GGUF fallback"
    )
def auto_select(
    model_size_gb: float,
    config_override: Optional[str] = None,
    prefer_quality: bool = True,
    require_vllm: bool = False,
) -> SelectionResult:
    """
    Pick a preset, honouring an explicit override before probing the system.

    Args:
        model_size_gb: Model size in GB
        config_override: Preset key to force; an unknown key is logged and
            ignored, after which normal detection proceeds
        prefer_quality: Forwarded unchanged to select_preset
        require_vllm: Forwarded unchanged to select_preset

    Returns:
        SelectionResult for the override, or the result of select_preset
        using the detected available memory
    """
    if config_override and config_override not in PRESETS:
        logger.warning("Unknown preset in config: %s, falling back to auto-select", config_override)
    elif config_override:
        forced = PRESETS[config_override]
        logger.info("Using config override: %s", config_override)
        return SelectionResult(
            preset=config_override,
            reason=f"Config override: {forced['name']}",
            overhead_gb=0,  # no system detection happens on this path
            quality=forced["quality"],
            compression_ratio=forced["compression_ratio"],
            vllm_compatible=forced["vllm_compatible"],
        )

    # No usable override: measure the machine and choose from that.
    detected = SystemInfo.detect()
    logger.info(
        "System: %.1f GB total, %.1f GB available, model: %.1f GB",
        detected.total_gb, detected.available_gb, model_size_gb
    )
    return select_preset(
        available_gb=detected.available_gb,
        model_size_gb=model_size_gb,
        prefer_quality=prefer_quality,
        require_vllm=require_vllm,
    )
def get_preset_info(preset_name: str) -> Optional[dict]:
    """Return the PRESETS entry for *preset_name*, or None if it is unknown."""
    if preset_name in PRESETS:
        return PRESETS[preset_name]
    return None
def list_presets() -> dict:
    """Return a shallow copy of the preset table (entries are shared, not copied)."""
    return dict(PRESETS)
# CLI interface: either list the preset table or run auto-selection for a
# given model size and print the result (plain text or JSON).
if __name__ == "__main__":
    import argparse
    import json

    parser = argparse.ArgumentParser(description="TurboQuant Auto-Select")
    parser.add_argument("--model-size", type=float, required=True, help="Model size in GB")
    parser.add_argument("--preset", help="Config override preset")
    parser.add_argument("--prefer-quality", action="store_true", default=True, help="Prefer quality")
    # store_true with default=True can never yield False; provide the
    # complementary switch so the option can actually be turned off.
    parser.add_argument(
        "--no-prefer-quality",
        dest="prefer_quality",
        action="store_false",
        help="Do not prefer quality",
    )
    parser.add_argument("--require-vllm", action="store_true", help="Require vLLM compatibility")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--list", action="store_true", help="List all presets")
    args = parser.parse_args()

    if args.list:
        print("Available presets:")
        for name, info in PRESETS.items():
            # Both branches used to be the empty string, so the column never
            # showed whether a preset is vLLM-compatible.
            vllm = "yes" if info["vllm_compatible"] else "no"
            print(f" {name:20} {info['quality']:8} {info['compression_ratio']}x vLLM:{vllm} {info['description']}")
    else:
        result = auto_select(
            model_size_gb=args.model_size,
            config_override=args.preset,
            prefer_quality=args.prefer_quality,
            require_vllm=args.require_vllm,
        )
        if args.json:
            print(json.dumps(result.to_dict(), indent=2))
        else:
            print(f"Selected: {result.preset}")
            print(f"Reason: {result.reason}")
            print(f"Quality: {result.quality}")
            print(f"Compression: {result.compression_ratio}x")
            print(f"vLLM compatible: {result.vllm_compatible}")