Compare commits

..

1 Commit

Author SHA1 Message Date
Alexander Whitestone
a537511652 refactor: consolidate hardware optimizer with quant selector (#92)
All checks were successful
Smoke Test / smoke (pull_request) Successful in 17s
2026-04-20 20:38:56 -04:00
5 changed files with 49 additions and 895 deletions

View File

@@ -1,5 +1,29 @@
"""Phase 19: Hardware-Aware Inference Optimization.
Part of the TurboQuant suite for local inference excellence.
"""Backward-compatible shim for hardware-aware quantization selection.
The original Phase 19 placeholder `hardware_optimizer.py` never shipped real
logic. The canonical implementation now lives in `evolution.quant_selector`.
This shim preserves the legacy import path for any downstream callers while
making `quant_selector.py` the single source of truth.
"""
import logging
# ... (rest of the code)
from evolution.quant_selector import ( # noqa: F401
HardwareInfo,
QuantLevel,
QuantSelection,
QUANT_LEVELS,
detect_hardware,
estimate_kv_cache_gb,
estimate_model_memory_gb,
select_quant_level,
)
__all__ = [
"HardwareInfo",
"QuantLevel",
"QuantSelection",
"QUANT_LEVELS",
"detect_hardware",
"estimate_kv_cache_gb",
"estimate_model_memory_gb",
"select_quant_level",
]

View File

@@ -1,311 +0,0 @@
#!/usr/bin/env python3
"""Tests for turboquant/auto_select.py — preset selection logic."""
import json
import os
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from turboquant.auto_select import (
select_preset,
auto_select,
PRESETS,
QUALITY_ORDER,
SelectionResult,
SystemInfo,
_load_config,
save_config,
format_env_commands,
)
# ── Preset Selection ──────────────────────────────────────────────────────────
class TestSelectPreset:
    """Exercise the overhead-tier logic of select_preset."""

    def test_high_overhead_best_quality(self):
        """8+ GB of headroom lands on the top-quality preset."""
        sel = select_preset(available_gb=20, model_size_gb=10)
        assert sel.preset == "turboquant_k8v4"
        assert sel.quality == "best"
        assert sel.compression_ratio == 2.6
        assert sel.vllm_compatible is True

    def test_medium_overhead_good_quality(self):
        """4-8 GB of headroom selects turboquant_4bit_nc."""
        sel = select_preset(available_gb=12, model_size_gb=6)
        assert sel.preset == "turboquant_4bit_nc"
        assert sel.quality == "good"
        assert sel.compression_ratio == 3.8

    def test_low_overhead_usable_quality(self):
        """2-4 GB of headroom selects turboquant_3bit_nc."""
        sel = select_preset(available_gb=8, model_size_gb=5)
        assert sel.preset == "turboquant_3bit_nc"
        assert sel.quality == "usable"
        assert sel.compression_ratio == 4.9

    def test_minimal_overhead_fallback(self):
        """Under 2 GB of headroom falls back to the q4_0 GGUF preset."""
        sel = select_preset(available_gb=5, model_size_gb=4)
        assert sel.preset == "q4_0"
        assert sel.quality == "basic"
        assert sel.vllm_compatible is False

    def test_negative_overhead_fallback(self):
        """A memory deficit still yields q4_0 plus an OVERCOMMITTED warning."""
        sel = select_preset(available_gb=3, model_size_gb=10)
        assert sel.preset == "q4_0"
        assert sel.overhead_gb < 0
        assert len(sel.warnings) > 0
        assert "OVERCOMMITTED" in sel.warnings[0]

    def test_exact_threshold(self):
        """Headroom exactly on a tier boundary selects that tier (>= semantics)."""
        sel = select_preset(available_gb=12, model_size_gb=4)  # 8 GB overhead
        assert sel.preset == "turboquant_k8v4"

    def test_just_below_threshold(self):
        """Headroom just under a boundary drops to the next tier."""
        sel = select_preset(available_gb=11.9, model_size_gb=4)  # 7.9 GB overhead
        assert sel.preset == "turboquant_4bit_nc"

    def test_zero_model_size(self):
        """A zero-size model means everything is headroom -> best preset."""
        sel = select_preset(available_gb=16, model_size_gb=0)
        assert sel.preset == "turboquant_k8v4"
# ── vLLM Requirement ─────────────────────────────────────────────────────────
class TestVLLMRequirement:
    """Behaviour of the require_vllm filter in select_preset."""

    def test_require_vllm_skips_fallback(self):
        """With require_vllm, q4_0 is skipped in the scan but remains the last resort."""
        # 1 GB of headroom: no vLLM-compatible tier fits, so the final
        # fallback (q4_0) is still returned despite the flag.
        sel = select_preset(available_gb=5, model_size_gb=4, require_vllm=True)
        assert sel.preset in QUALITY_ORDER

    def test_require_vllm_high_overhead(self):
        """Ample headroom plus require_vllm yields a vLLM-compatible preset."""
        sel = select_preset(available_gb=20, model_size_gb=10, require_vllm=True)
        assert sel.vllm_compatible is True
# ── SelectionResult ───────────────────────────────────────────────────────────
class TestSelectionResult:
    """Shape and contents of the SelectionResult payload."""

    def test_to_dict(self):
        """to_dict carries preset, quality, env vars, and server flags."""
        payload = select_preset(available_gb=20, model_size_gb=10).to_dict()
        assert payload["preset"] == "turboquant_k8v4"
        assert payload["quality"] == "best"
        assert "env_vars" in payload
        assert "server_flags" in payload
        assert "TURBO_LAYER_ADAPTIVE" in payload["env_vars"]
        assert "-ctk" in payload["server_flags"]

    def test_env_vars_populated(self):
        """The best preset exports layer-adaptive level 7."""
        sel = select_preset(available_gb=20, model_size_gb=10)
        assert "TURBO_LAYER_ADAPTIVE" in sel.env_vars
        assert sel.env_vars["TURBO_LAYER_ADAPTIVE"] == "7"

    def test_server_flags_populated(self):
        """Both KV-cache flags (-ctk/-ctv) are present."""
        sel = select_preset(available_gb=20, model_size_gb=10)
        assert "-ctk" in sel.server_flags
        assert "-ctv" in sel.server_flags
# ── Preset Definitions ────────────────────────────────────────────────────────
class TestPresets:
    """Test preset definitions."""

    def test_all_presets_have_required_fields(self):
        # Every preset must carry the full schema read by select_preset()
        # and _make_result(); a missing key would raise KeyError at runtime.
        for name, preset in PRESETS.items():
            assert "name" in preset, f"{name} missing 'name'"
            assert "description" in preset, f"{name} missing 'description'"
            assert "min_overhead_gb" in preset, f"{name} missing 'min_overhead_gb'"
            assert "compression_ratio" in preset, f"{name} missing 'compression_ratio'"
            assert "quality" in preset, f"{name} missing 'quality'"
            assert "vllm_compatible" in preset, f"{name} missing 'vllm_compatible'"
            assert "kv_type" in preset, f"{name} missing 'kv_type'"
            assert "layer_adaptive" in preset, f"{name} missing 'layer_adaptive'"

    def test_quality_order_matches_presets(self):
        # QUALITY_ORDER must reference only defined presets.
        for name in QUALITY_ORDER:
            assert name in PRESETS, f"{name} in QUALITY_ORDER but not in PRESETS"

    def test_all_presets_in_quality_order(self):
        # ...and every preset must be reachable via the ordered scan.
        for name in PRESETS:
            assert name in QUALITY_ORDER, f"{name} in PRESETS but not in QUALITY_ORDER"

    def test_compression_increases_with_aggressiveness(self):
        """More aggressive presets should have higher compression."""
        for i in range(len(QUALITY_ORDER) - 1):
            current = PRESETS[QUALITY_ORDER[i]]
            next_p = PRESETS[QUALITY_ORDER[i + 1]]
            # TurboQuant presets should increase in compression
            # q4_0 is a fallback and may break the pattern
            if QUALITY_ORDER[i + 1] != "q4_0":
                assert current["compression_ratio"] <= next_p["compression_ratio"], (
                    f"{QUALITY_ORDER[i]} ({current['compression_ratio']}x) should have "
                    f"<= compression than {QUALITY_ORDER[i+1]} ({next_p['compression_ratio']}x)"
                )

    def test_overhead_decreases_with_aggressiveness(self):
        """More aggressive presets should have lower overhead requirements."""
        for i in range(len(QUALITY_ORDER) - 1):
            current = PRESETS[QUALITY_ORDER[i]]
            next_p = PRESETS[QUALITY_ORDER[i + 1]]
            assert current["min_overhead_gb"] >= next_p["min_overhead_gb"], (
                f"{QUALITY_ORDER[i]} (overhead={current['min_overhead_gb']}GB) should have "
                f">= overhead than {QUALITY_ORDER[i+1]} (overhead={next_p['min_overhead_gb']}GB)"
            )
# ── SystemInfo ────────────────────────────────────────────────────────────────
class TestSystemInfo:
    """Sanity checks on real system detection (no mocking)."""

    def test_detect_returns_info(self):
        """Detection always produces positive memory figures and a method tag."""
        detected = SystemInfo.detect()
        assert detected.total_gb > 0
        assert detected.available_gb > 0
        assert detected.detection_method

    def test_available_less_than_total(self):
        """Available memory can never exceed the physical total."""
        detected = SystemInfo.detect()
        assert detected.available_gb <= detected.total_gb
# ── Config ────────────────────────────────────────────────────────────────────
class TestConfig:
    """Round-trip behaviour of _load_config / save_config."""

    def test_load_config_missing_file(self, tmp_path):
        """A missing file loads as an empty dict, not an error."""
        assert _load_config(str(tmp_path / "nonexistent.json")) == {}

    def test_load_config_valid_file(self, tmp_path):
        """A valid JSON file loads verbatim."""
        path = tmp_path / "turboquant.json"
        path.write_text(json.dumps({"preset_override": "turboquant_4bit_nc"}))
        loaded = _load_config(str(path))
        assert loaded["preset_override"] == "turboquant_4bit_nc"

    def test_save_and_load_config(self, tmp_path):
        """Values written by save_config come back via _load_config."""
        path = tmp_path / "turboquant.json"
        save_config(
            {"preset_override": "turboquant_k8v4", "context_length": 32768},
            str(path),
        )
        loaded = _load_config(str(path))
        assert loaded["preset_override"] == "turboquant_k8v4"
        assert loaded["context_length"] == 32768

    def test_save_config_merges(self, tmp_path):
        """save_config merges into existing content instead of clobbering it."""
        path = tmp_path / "turboquant.json"
        path.write_text(json.dumps({"existing_key": "value"}))
        save_config({"new_key": "new_value"}, str(path))
        loaded = _load_config(str(path))
        assert loaded["existing_key"] == "value"
        assert loaded["new_key"] == "new_value"
# ── Auto-Select ───────────────────────────────────────────────────────────────
class TestAutoSelect:
    """Test auto_select with mocked system detection."""

    def test_auto_select_with_override(self):
        """An explicit config_override argument wins without touching detection."""
        # Fixed: dropped an unused `tmp_path` fixture and `config_file` local —
        # this case exercises the config_override argument only.
        result = auto_select(model_size_gb=14.0, config_override="turboquant_4bit_nc")
        assert result.preset == "turboquant_4bit_nc"
        assert "Config override" in result.reason

    def test_auto_select_unknown_override_ignored(self):
        """An unknown override falls back to normal detection."""
        result = auto_select(model_size_gb=14.0, config_override="nonexistent_preset")
        # Should fall back to normal detection
        assert result.preset in QUALITY_ORDER

    def test_auto_select_config_override(self, tmp_path):
        """Config file preset_override is respected."""
        config_file = tmp_path / "turboquant.json"
        config_file.write_text(json.dumps({"preset_override": "turboquant_3bit_nc"}))
        result = auto_select(model_size_gb=14.0, config_path=str(config_file))
        assert result.preset == "turboquant_3bit_nc"

    def test_auto_select_detects_system(self):
        """Detection results are attached and drive preset choice."""
        with patch("turboquant.auto_select.SystemInfo.detect") as mock_detect:
            mock_detect.return_value = SystemInfo(
                total_gb=32, available_gb=24, is_apple_silicon=True,
                detection_method="mock"
            )
            result = auto_select(model_size_gb=14.0)
            assert result.system_info is not None
            assert result.system_info.total_gb == 32
            assert result.preset == "turboquant_k8v4"  # 24 - 14 = 10 GB overhead

    def test_auto_select_tight_memory(self):
        """Tight headroom drops to the matching lower tier."""
        with patch("turboquant.auto_select.SystemInfo.detect") as mock_detect:
            mock_detect.return_value = SystemInfo(
                total_gb=16, available_gb=12, detection_method="mock"
            )
            result = auto_select(model_size_gb=10.0)
            # 12 - 10 = 2 GB overhead -> turboquant_3bit_nc (min 2 GB)
            assert result.preset == "turboquant_3bit_nc"
# ── Format ────────────────────────────────────────────────────────────────────
class TestFormat:
    """Shell-command rendering of selection results."""

    def test_format_env_commands(self):
        """The rendered output names the preset and exports its env var."""
        rendered = format_env_commands(select_preset(available_gb=20, model_size_gb=10))
        assert "export TURBO_LAYER_ADAPTIVE=7" in rendered
        assert "turboquant_k8v4" in rendered

    def test_format_env_commands_with_warnings(self):
        """Low-headroom selections surface a WARNING comment."""
        rendered = format_env_commands(select_preset(available_gb=5, model_size_gb=4))
        assert "WARNING" in rendered
# ── Integration with issue spec ───────────────────────────────────────────────
class TestIssueSpec:
    """Pin the exact tier boundaries specified in issue #97."""

    def test_overhead_8gb_plus(self):
        """overhead >= 8 -> turboquant_k8v4"""
        sel = select_preset(available_gb=15, model_size_gb=7)  # 8 GB overhead
        assert sel.preset == "turboquant_k8v4"

    def test_overhead_4_to_8(self):
        """4 <= overhead < 8 -> turboquant_4bit_nc"""
        sel = select_preset(available_gb=11, model_size_gb=7)  # 4 GB overhead
        assert sel.preset == "turboquant_4bit_nc"

    def test_overhead_2_to_4(self):
        """2 <= overhead < 4 -> turboquant_3bit_nc"""
        sel = select_preset(available_gb=9, model_size_gb=7)  # 2 GB overhead
        assert sel.preset == "turboquant_3bit_nc"

    def test_overhead_under_2(self):
        """overhead < 2 -> q4_0"""
        sel = select_preset(available_gb=8, model_size_gb=7)  # 1 GB overhead
        assert sel.preset == "q4_0"

View File

@@ -0,0 +1,21 @@
#!/usr/bin/env python3
"""Tests for hardware_optimizer compatibility shim."""
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from evolution import hardware_optimizer, quant_selector


def test_hardware_optimizer_reexports_quant_selector_api():
    """Every re-exported callable/class is identical (is) to the canonical one."""
    for attr in ("select_quant_level", "detect_hardware", "HardwareInfo", "QuantSelection"):
        assert getattr(hardware_optimizer, attr) is getattr(quant_selector, attr)


def test_hardware_optimizer_exports_quant_level_definitions():
    """The level table and enum are shared objects, not copies."""
    for attr in ("QUANT_LEVELS", "QuantLevel"):
        assert getattr(hardware_optimizer, attr) is getattr(quant_selector, attr)

View File

@@ -1,23 +0,0 @@
"""TurboQuant — Auto-configuration and preset selection for KV cache compression."""
from turboquant.auto_select import (
auto_select,
select_preset,
list_presets,
get_preset_info,
SystemInfo,
SelectionResult,
PRESETS,
QUALITY_ORDER,
)
__all__ = [
"auto_select",
"select_preset",
"list_presets",
"get_preset_info",
"SystemInfo",
"SelectionResult",
"PRESETS",
"QUALITY_ORDER",
]

View File

@@ -1,557 +0,0 @@
#!/usr/bin/env python3
"""
TurboQuant Auto-Select — Choose optimal KV cache preset based on available memory.
Implements the config helper from issue #97: detects available memory and
selects the best TurboQuant preset for the deployment environment.
Presets map to deployment configurations (not raw quant levels):
turboquant_k8v4 — 8+ GB overhead, best quality (k/v asymmetric: 8-bit K, 4-bit V)
turboquant_4bit_nc — 4-8 GB overhead, good quality (4-bit, no calibration)
turboquant_3bit_nc — 2-4 GB overhead, usable quality (3-bit, no calibration)
q4_0 — <2 GB overhead, GGUF fallback (no vLLM)
Usage:
from turboquant.auto_select import auto_select
result = auto_select(model_size_gb=7.0)
print(f"Selected: {result.preset}")
CLI:
python3 turboquant/auto_select.py --model-size 7.0
python3 turboquant/auto_select.py --model-size 7.0 --json
python3 turboquant/auto_select.py --list
"""
import json
import logging
import os
import platform
import subprocess
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Optional
logger = logging.getLogger(__name__)
# ── Preset Definitions ────────────────────────────────────────────────────────
# Deployment presets consumed by select_preset()/_make_result(). Field guide:
#   min_overhead_gb   — headroom floor (available_gb - model_size_gb) for the tier
#   compression_ratio — expected KV-cache compression factor (informational)
#   kv_type           — llama-server -ctk/-ctv cache type
#   layer_adaptive    — exported as TURBO_LAYER_ADAPTIVE (0 = disabled)
PRESETS: Dict[str, Dict[str, Any]] = {
    "turboquant_k8v4": {
        "name": "TurboQuant K8V4",
        "description": "Best quality — asymmetric 8-bit K, 4-bit V",
        "min_overhead_gb": 8,
        "compression_ratio": 2.6,
        "quality": "best",
        "vllm_compatible": True,
        "kv_type": "turbo4",
        "layer_adaptive": 7,
    },
    "turboquant_4bit_nc": {
        "name": "TurboQuant 4-bit NC",
        "description": "Good quality — 4-bit, no calibration",
        "min_overhead_gb": 4,
        "compression_ratio": 3.8,
        "quality": "good",
        "vllm_compatible": True,
        "kv_type": "turbo4",
        "layer_adaptive": 5,
    },
    "turboquant_3bit_nc": {
        "name": "TurboQuant 3-bit NC",
        "description": "Usable quality — 3-bit, no calibration",
        "min_overhead_gb": 2,
        "compression_ratio": 4.9,
        "quality": "usable",
        "vllm_compatible": True,
        "kv_type": "turbo3",
        "layer_adaptive": 3,
    },
    # Final fallback: plain GGUF quantization, no TurboQuant features.
    "q4_0": {
        "name": "Q4_0 GGUF",
        "description": "GGUF fallback — no TurboQuant, no vLLM",
        "min_overhead_gb": 0,
        "compression_ratio": 4.0,
        "quality": "basic",
        "vllm_compatible": False,
        "kv_type": "q4_0",
        "layer_adaptive": 0,
    },
}
# Ordered from best quality to most aggressive
QUALITY_ORDER = ["turboquant_k8v4", "turboquant_4bit_nc", "turboquant_3bit_nc", "q4_0"]
# ── Data Classes ──────────────────────────────────────────────────────────────
@dataclass
class SystemInfo:
    """Detected system resources.

    Memory figures are GB computed as bytes / 1024**3. GPU fields stay None
    when no GPU (or detection tool) is available.
    """

    total_gb: float
    available_gb: float
    gpu_memory_gb: Optional[float] = None
    gpu_name: Optional[str] = None
    is_apple_silicon: bool = False
    detection_method: str = ""  # which probe produced the numbers ("sysctl", "proc", ...)

    @classmethod
    def detect(cls) -> "SystemInfo":
        """Detect available memory. Uses platform-appropriate methods."""
        system = platform.system()
        if system == "Darwin":
            return cls._detect_macos()
        elif system == "Linux":
            return cls._detect_linux()
        else:
            return cls._detect_fallback(system)

    @classmethod
    def _detect_macos(cls) -> "SystemInfo":
        """Detect Apple Silicon unified memory via sysctl / vm_stat / system_profiler."""
        info = cls(total_gb=0, available_gb=0, is_apple_silicon=True, detection_method="sysctl")
        try:
            r = subprocess.run(["sysctl", "-n", "hw.memsize"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                info.total_gb = int(r.stdout.strip()) / (1024 ** 3)
            r = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                # BUGFIX: the page size was hard-coded to 4096, but Apple Silicon
                # uses 16 KiB pages. vm_stat's first line reports the real value
                # ("Mach Virtual Memory Statistics: (page size of 16384 bytes)"),
                # so parse it and keep 4096 only as a last-resort default.
                page_size = 4096
                header = r.stdout.split("\n", 1)[0]
                if "page size of" in header:
                    try:
                        page_size = int(header.split("page size of", 1)[1].split()[0])
                    except (ValueError, IndexError):
                        pass
                free_pages = 0
                for line in r.stdout.split("\n"):
                    if "Pages free:" in line:
                        try:
                            free_pages = int(line.split(":")[-1].strip().rstrip("."))
                        except ValueError:
                            pass
                info.available_gb = (free_pages * page_size) / (1024 ** 3)
            if info.available_gb < 1:
                # "Pages free" excludes reclaimable (inactive/purgeable) memory,
                # so an implausibly low reading falls back to 70% of total.
                info.available_gb = info.total_gb * 0.70
            info.gpu_memory_gb = info.total_gb  # Unified memory: GPU shares RAM
            r = subprocess.run(
                ["system_profiler", "SPDisplaysDataType"],
                capture_output=True, text=True, timeout=10
            )
            if r.returncode == 0:
                for line in r.stdout.split("\n"):
                    if "Chipset" in line or "GPU" in line:
                        info.gpu_name = line.split(":")[-1].strip()
                        break
        except Exception as e:
            logger.warning("macOS detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"
        return info

    @classmethod
    def _detect_linux(cls) -> "SystemInfo":
        """Detect Linux memory via /proc/meminfo, with optional NVIDIA GPU probe."""
        info = cls(total_gb=0, available_gb=0, detection_method="proc")
        try:
            with open("/proc/meminfo") as f:
                for line in f:
                    # /proc/meminfo reports values in KiB.
                    if line.startswith("MemTotal:"):
                        info.total_gb = int(line.split()[1]) / (1024 * 1024)
                    elif line.startswith("MemAvailable:"):
                        info.available_gb = int(line.split()[1]) / (1024 * 1024)
            try:
                r = subprocess.run(
                    ["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
                     "--format=csv,noheader,nounits"],
                    capture_output=True, text=True, timeout=10
                )
                if r.returncode == 0 and r.stdout.strip():
                    parts = r.stdout.strip().split("\n")[0].split(", ")
                    if len(parts) >= 3:
                        info.gpu_name = parts[0].strip()
                        info.gpu_memory_gb = float(parts[1]) / 1024  # MiB -> GB
                        # Prefer GPU free memory when it exceeds system headroom.
                        info.available_gb = max(info.available_gb, float(parts[2]) / 1024)
                        info.detection_method = "nvidia-smi"
            except (FileNotFoundError, subprocess.TimeoutExpired):
                pass  # no NVIDIA tooling — CPU-only numbers stand
        except Exception as e:
            logger.warning("Linux detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"
        return info

    @classmethod
    def _detect_fallback(cls, system: str) -> "SystemInfo":
        """Fallback for unknown systems. Tries psutil, else a fixed 16/12 GB guess."""
        try:
            import psutil
            mem = psutil.virtual_memory()
            return cls(
                total_gb=round(mem.total / (1024 ** 3), 1),
                available_gb=round(mem.available / (1024 ** 3), 1),
                detection_method="psutil",
            )
        except ImportError:
            return cls(total_gb=16.0, available_gb=12.0, detection_method="fallback")
@dataclass
class SelectionResult:
    """Outcome of a preset selection, including env/flag material for deployment."""

    preset: str
    reason: str
    overhead_gb: float
    quality: str
    compression_ratio: float
    vllm_compatible: bool
    kv_type: str
    layer_adaptive: int
    env_vars: Dict[str, str] = field(default_factory=dict)
    server_flags: Dict[str, str] = field(default_factory=dict)
    warnings: list = field(default_factory=list)
    system_info: Optional[SystemInfo] = None

    def to_dict(self) -> dict:
        """Serialize the selection (plus system info, when attached) to plain dicts."""
        payload = {
            "preset": self.preset,
            "reason": self.reason,
            "overhead_gb": round(self.overhead_gb, 2),
            "quality": self.quality,
            "compression_ratio": self.compression_ratio,
            "vllm_compatible": self.vllm_compatible,
            "kv_type": self.kv_type,
            "layer_adaptive": self.layer_adaptive,
            "env_vars": self.env_vars,
            "server_flags": self.server_flags,
            "warnings": self.warnings,
        }
        si = self.system_info
        if si is not None:
            payload["system"] = {
                "total_gb": round(si.total_gb, 1),
                "available_gb": round(si.available_gb, 1),
                "gpu_name": si.gpu_name,
                "is_apple_silicon": si.is_apple_silicon,
                "detection_method": si.detection_method,
            }
        return payload
# ── Selection Logic ───────────────────────────────────────────────────────────
def select_preset(
    available_gb: float,
    model_size_gb: float,
    require_vllm: bool = False,
) -> SelectionResult:
    """Pick the best TurboQuant preset for the available memory headroom.

    Args:
        available_gb: Available system memory in GB.
        model_size_gb: Model size in GB.
        require_vllm: If True, only select vLLM-compatible presets.

    Returns:
        SelectionResult with the chosen preset and reasoning.
    """
    overhead_gb = available_gb - model_size_gb

    # Overcommitted: the model alone exceeds available memory.
    if overhead_gb < 0:
        logger.warning(
            "Insufficient memory: need %.1f GB, have %.1f GB available",
            model_size_gb, available_gb
        )
        return _make_result(
            "q4_0", overhead_gb,
            reason=f"Insufficient memory ({overhead_gb:.1f} GB deficit), using GGUF fallback",
        )

    # Scan best-quality-first; the first tier whose headroom floor fits wins.
    for preset_name in QUALITY_ORDER:
        preset = PRESETS[preset_name]
        vllm_ok = preset["vllm_compatible"] or not require_vllm
        if vllm_ok and overhead_gb >= preset["min_overhead_gb"]:
            return _make_result(
                preset_name,
                overhead_gb,
                reason=(
                    f"Overhead {overhead_gb:.1f} GB >= {preset['min_overhead_gb']} GB "
                    f"required for {preset['name']}"
                ),
            )

    # Nothing fit (e.g. require_vllm filtered everything) — aggressive fallback.
    return _make_result(
        "q4_0", overhead_gb,
        reason=f"Overhead {overhead_gb:.1f} GB too low for TurboQuant, using GGUF fallback",
    )
def _make_result(preset_name: str, overhead_gb: float, reason: str) -> SelectionResult:
    """Assemble a SelectionResult for *preset_name*, deriving warnings from headroom."""
    spec = PRESETS[preset_name]

    # The two warning conditions are mutually exclusive, so at most one fires.
    warnings = []
    if overhead_gb < 0:
        warnings.append(
            f"OVERCOMMITTED: model needs {abs(overhead_gb):.1f} GB more than available. "
            f"Inference may fail or swap heavily."
        )
    elif overhead_gb < 2.0:
        warnings.append(f"Low headroom ({overhead_gb:.1f} GB). Consider reducing context length.")

    return SelectionResult(
        preset=preset_name,
        reason=reason,
        overhead_gb=overhead_gb,
        quality=spec["quality"],
        compression_ratio=spec["compression_ratio"],
        vllm_compatible=spec["vllm_compatible"],
        kv_type=spec["kv_type"],
        layer_adaptive=spec["layer_adaptive"],
        env_vars={"TURBO_LAYER_ADAPTIVE": str(spec["layer_adaptive"])},
        server_flags={"-ctk": spec["kv_type"], "-ctv": spec["kv_type"]},
        warnings=warnings,
    )
# ── Auto-Select (with system detection + config) ─────────────────────────────
def auto_select(
    model_size_gb: float,
    config_override: Optional[str] = None,
    require_vllm: bool = False,
    config_path: Optional[str] = None,
) -> SelectionResult:
    """Detect system memory and select the best preset.

    Args:
        model_size_gb: Model size in GB.
        config_override: Force a specific preset (skips detection).
        require_vllm: Require a vLLM-compatible preset.
        config_path: Path to config file (default: $HERMES_HOME/turboquant.json).

    Returns:
        SelectionResult, with system_info attached when detection ran.
    """
    config = _load_config(config_path)

    # An explicit override (argument first, then config file) bypasses detection.
    if override := (config_override or config.get("preset_override")):
        if override in PRESETS:
            logger.info("Using config override: %s", override)
            p = PRESETS[override]
            return SelectionResult(
                preset=override,
                reason=f"Config override: {p['name']}",
                overhead_gb=0,
                quality=p["quality"],
                compression_ratio=p["compression_ratio"],
                vllm_compatible=p["vllm_compatible"],
                kv_type=p["kv_type"],
                layer_adaptive=p["layer_adaptive"],
                env_vars={"TURBO_LAYER_ADAPTIVE": str(p["layer_adaptive"])},
                server_flags={"-ctk": p["kv_type"], "-ctv": p["kv_type"]},
            )
        logger.warning("Unknown preset override '%s', falling back to auto-select", override)

    # No usable override: detect the machine and pick by headroom.
    system = SystemInfo.detect()
    logger.info(
        "System: %.1f GB total, %.1f GB available, model: %.1f GB",
        system.total_gb, system.available_gb, model_size_gb
    )
    result = select_preset(
        available_gb=system.available_gb,
        model_size_gb=model_size_gb,
        require_vllm=require_vllm,
    )
    result.system_info = system

    # Honor an optional context-length setting from the config file.
    if context_length := config.get("context_length"):
        result.server_flags["-c"] = str(context_length)
    return result
def _load_config(config_path: Optional[str] = None) -> dict:
    """Load the turboquant JSON config.

    Args:
        config_path: Explicit path to the config file. When omitted, resolves
            to $HERMES_HOME/turboquant.json (via hermes_constants) or, failing
            that, ~/.hermes/turboquant.json.

    Returns:
        Parsed config dict, or {} when the file is missing or unreadable.
    """
    if config_path:
        p = Path(config_path)
    else:
        try:
            from hermes_constants import get_hermes_home
            p = get_hermes_home() / "turboquant.json"
        except Exception:
            # BUGFIX: was `except (ImportError, Exception)` — the tuple was
            # redundant since Exception already subsumes ImportError.
            p = Path.home() / ".hermes" / "turboquant.json"
    if p.exists():
        try:
            return json.loads(p.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            pass  # unreadable/corrupt config is treated as absent
    return {}
def save_config(values: dict, config_path: Optional[str] = None) -> None:
    """Save turboquant config to a JSON file, merging into existing content.

    Args:
        values: Key/value pairs to write; they override existing keys.
        config_path: Explicit path to the config file. When omitted, resolves
            to $HERMES_HOME/turboquant.json (via hermes_constants) or, failing
            that, ~/.hermes/turboquant.json.
    """
    if config_path:
        p = Path(config_path)
    else:
        try:
            from hermes_constants import get_hermes_home
            p = get_hermes_home() / "turboquant.json"
        except Exception:
            # BUGFIX: was `except (ImportError, Exception)` — the tuple was
            # redundant since Exception already subsumes ImportError.
            p = Path.home() / ".hermes" / "turboquant.json"
    p.parent.mkdir(parents=True, exist_ok=True)
    # Merge with any existing config so unrelated keys survive.
    existing = {}
    if p.exists():
        try:
            existing = json.loads(p.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            pass  # corrupt file is overwritten with the new values
    existing.update(values)
    p.write_text(json.dumps(existing, indent=2), encoding="utf-8")
    logger.info("Config saved to %s", p)
# ── Utility ───────────────────────────────────────────────────────────────────
def get_preset_info(preset_name: str) -> Optional[dict]:
    """Look up the configuration dict for *preset_name*, or None if unknown."""
    try:
        return PRESETS[preset_name]
    except KeyError:
        return None
def list_presets() -> dict:
    """Return a shallow copy of all preset definitions."""
    return dict(PRESETS)
def format_env_commands(result: SelectionResult) -> str:
    """Render *result* as shell `export` commands plus `#` comment hints."""
    lines = [
        f"# TurboQuant preset: {result.preset} ({result.quality}, {result.compression_ratio}x)",
        f"# {result.reason}",
    ]
    lines.extend(f"export {k}={v}" for k, v in result.env_vars.items())
    flags = " ".join(f"{k} {v}" for k, v in result.server_flags.items())
    if flags:
        lines.append(f"# llama-server flags: {flags}")
    lines.extend(f"# WARNING: {w}" for w in result.warnings)
    return "\n".join(lines)
# ── CLI ───────────────────────────────────────────────────────────────────────
def main():
    """CLI entry point: list presets, detect hardware, or auto-select a preset."""
    import argparse
    parser = argparse.ArgumentParser(
        description="Auto-select TurboQuant preset based on available memory",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --model-size 14.0                        # Auto-detect and select
  %(prog)s --model-size 7.0 --json                  # JSON output
  %(prog)s --model-size 7.0 --list                  # List available presets
  %(prog)s --model-size 7.0 --require-vllm
  %(prog)s --model-size 7.0 --preset turboquant_4bit_nc  # Override
""",
    )
    parser.add_argument("--model-size", type=float, help="Model size in GB")
    parser.add_argument("--preset", help="Force a specific preset (skip detection)")
    parser.add_argument("--require-vllm", action="store_true", help="Require vLLM compatibility")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--list", action="store_true", help="List available presets")
    parser.add_argument("--shell", action="store_true", help="Output as shell export commands")
    parser.add_argument("--detect-only", action="store_true", help="Only detect hardware")
    parser.add_argument("--config", help="Path to config file")
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # --list: print the preset table and exit.
    if args.list:
        print("Available presets:")
        print(f" {'Name':22} {'Quality':8} {'Compress':8} {'vLLM':5} {'Overhead':8} Description")
        for name in QUALITY_ORDER:
            p = PRESETS[name]
            vllm = "yes" if p["vllm_compatible"] else "no"
            print(
                f" {name:22} {p['quality']:8} {p['compression_ratio']:.1f}x "
                f"{vllm:5} >= {p['min_overhead_gb']:2.0f} GB {p['description']}"
            )
        return

    # --detect-only: report hardware without selecting a preset.
    if args.detect_only:
        info = SystemInfo.detect()
        if args.json:
            print(json.dumps({
                "total_gb": round(info.total_gb, 1),
                "available_gb": round(info.available_gb, 1),
                "gpu_memory_gb": round(info.gpu_memory_gb, 1) if info.gpu_memory_gb else None,
                "gpu_name": info.gpu_name,
                "is_apple_silicon": info.is_apple_silicon,
                "detection_method": info.detection_method,
            }, indent=2))
        else:
            print(f"Total memory: {info.total_gb:.1f} GB")
            print(f"Available: {info.available_gb:.1f} GB")
            if info.gpu_memory_gb:
                print(f"GPU memory: {info.gpu_memory_gb:.1f} GB")
            if info.gpu_name:
                print(f"GPU: {info.gpu_name}")
            if info.is_apple_silicon:
                print("Chip: Apple Silicon")
            print(f"Detection: {info.detection_method}")
        return

    # BUGFIX: was `if not args.model_size:` — that also rejected an explicit
    # `--model-size 0`; only a missing argument should trigger the error.
    if args.model_size is None:
        parser.error("--model-size is required (unless using --list or --detect-only)")

    result = auto_select(
        model_size_gb=args.model_size,
        config_override=args.preset,
        require_vllm=args.require_vllm,
        config_path=args.config,
    )
    if args.json:
        print(json.dumps(result.to_dict(), indent=2))
    elif args.shell:
        print(format_env_commands(result))
    else:
        # Human-readable summary: selection, system, env vars, flags, warnings.
        print(f"Selected: {result.preset} ({result.quality}, {result.compression_ratio}x)")
        print(f" {result.reason}")
        if result.system_info:
            print(f" System: {result.system_info.total_gb:.0f} GB total, {result.system_info.available_gb:.0f} GB available")
        print(f" Overhead: {result.overhead_gb:.1f} GB")
        print()
        print("Environment:")
        for k, v in result.env_vars.items():
            print(f" export {k}={v}")
        print("Server flags:")
        for k, v in result.server_flags.items():
            print(f" {k} {v}")
        if result.warnings:
            print()
            for w in result.warnings:
                print(f" WARNING: {w}")


if __name__ == "__main__":
    main()