Compare commits
1 Commits
burn/97-17
...
fix/92-har
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a537511652 |
@@ -1,5 +1,29 @@
|
||||
"""Phase 19: Hardware-Aware Inference Optimization.
|
||||
Part of the TurboQuant suite for local inference excellence.
|
||||
"""Backward-compatible shim for hardware-aware quantization selection.
|
||||
|
||||
The original Phase 19 placeholder `hardware_optimizer.py` never shipped real
|
||||
logic. The canonical implementation now lives in `evolution.quant_selector`.
|
||||
This shim preserves the legacy import path for any downstream callers while
|
||||
making `quant_selector.py` the single source of truth.
|
||||
"""
|
||||
import logging
|
||||
# ... (rest of the code)
|
||||
|
||||
from evolution.quant_selector import ( # noqa: F401
|
||||
HardwareInfo,
|
||||
QuantLevel,
|
||||
QuantSelection,
|
||||
QUANT_LEVELS,
|
||||
detect_hardware,
|
||||
estimate_kv_cache_gb,
|
||||
estimate_model_memory_gb,
|
||||
select_quant_level,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"HardwareInfo",
|
||||
"QuantLevel",
|
||||
"QuantSelection",
|
||||
"QUANT_LEVELS",
|
||||
"detect_hardware",
|
||||
"estimate_kv_cache_gb",
|
||||
"estimate_model_memory_gb",
|
||||
"select_quant_level",
|
||||
]
|
||||
|
||||
@@ -1,311 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for turboquant/auto_select.py — preset selection logic."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
|
||||
from turboquant.auto_select import (
|
||||
select_preset,
|
||||
auto_select,
|
||||
PRESETS,
|
||||
QUALITY_ORDER,
|
||||
SelectionResult,
|
||||
SystemInfo,
|
||||
_load_config,
|
||||
save_config,
|
||||
format_env_commands,
|
||||
)
|
||||
|
||||
|
||||
# ── Preset Selection ──────────────────────────────────────────────────────────
|
||||
|
||||
class TestSelectPreset:
    """Test preset selection logic.

    Overhead is `available_gb - model_size_gb`; thresholds are 8 / 4 / 2 GB
    per issue #97.
    """

    def test_high_overhead_best_quality(self):
        """8+ GB overhead -> turboquant_k8v4 (best)."""
        result = select_preset(available_gb=20, model_size_gb=10)
        assert result.preset == "turboquant_k8v4"
        assert result.quality == "best"
        assert result.compression_ratio == 2.6
        assert result.vllm_compatible is True

    def test_medium_overhead_good_quality(self):
        """4-8 GB overhead -> turboquant_4bit_nc (good)."""
        result = select_preset(available_gb=12, model_size_gb=6)
        assert result.preset == "turboquant_4bit_nc"
        assert result.quality == "good"
        assert result.compression_ratio == 3.8

    def test_low_overhead_usable_quality(self):
        """2-4 GB overhead -> turboquant_3bit_nc (usable)."""
        result = select_preset(available_gb=8, model_size_gb=5)
        assert result.preset == "turboquant_3bit_nc"
        assert result.quality == "usable"
        assert result.compression_ratio == 4.9

    def test_minimal_overhead_fallback(self):
        """<2 GB overhead -> q4_0 (basic fallback)."""
        result = select_preset(available_gb=5, model_size_gb=4)
        assert result.preset == "q4_0"
        assert result.quality == "basic"
        assert result.vllm_compatible is False

    def test_negative_overhead_fallback(self):
        """Negative overhead (not enough memory) -> q4_0 plus a warning."""
        result = select_preset(available_gb=3, model_size_gb=10)
        assert result.preset == "q4_0"
        assert result.overhead_gb < 0
        assert len(result.warnings) > 0
        assert "OVERCOMMITTED" in result.warnings[0]

    def test_exact_threshold(self):
        """Exactly at threshold selects that preset (>= comparison)."""
        result = select_preset(available_gb=12, model_size_gb=4)  # 8 GB overhead
        assert result.preset == "turboquant_k8v4"

    def test_just_below_threshold(self):
        """Just below threshold selects next tier."""
        result = select_preset(available_gb=11.9, model_size_gb=4)  # 7.9 GB overhead
        assert result.preset == "turboquant_4bit_nc"

    def test_zero_model_size(self):
        """Zero model size -> all overhead, best preset."""
        result = select_preset(available_gb=16, model_size_gb=0)
        assert result.preset == "turboquant_k8v4"
|
||||
|
||||
|
||||
# ── vLLM Requirement ─────────────────────────────────────────────────────────
|
||||
|
||||
class TestVLLMRequirement:
    """Test vLLM compatibility filtering (`require_vllm=True`)."""

    def test_require_vllm_skips_fallback(self):
        """require_vllm should skip q4_0 (not vLLM compatible)."""
        # Very low overhead — without vLLM requirement would pick q4_0
        result = select_preset(available_gb=5, model_size_gb=4, require_vllm=True)
        # q4_0 is not vLLM compatible; with 1 GB overhead nothing else fits
        # so it falls through to q4_0 as final fallback
        assert result.preset in QUALITY_ORDER

    def test_require_vllm_high_overhead(self):
        """With high overhead, vLLM-compatible preset selected."""
        result = select_preset(available_gb=20, model_size_gb=10, require_vllm=True)
        assert result.vllm_compatible is True
|
||||
|
||||
|
||||
# ── SelectionResult ───────────────────────────────────────────────────────────
|
||||
|
||||
class TestSelectionResult:
    """Test SelectionResult dataclass serialization and derived fields."""

    def test_to_dict(self):
        # to_dict must expose the preset plus the launch configuration.
        result = select_preset(available_gb=20, model_size_gb=10)
        d = result.to_dict()
        assert d["preset"] == "turboquant_k8v4"
        assert d["quality"] == "best"
        assert "env_vars" in d
        assert "server_flags" in d
        assert "TURBO_LAYER_ADAPTIVE" in d["env_vars"]
        assert "-ctk" in d["server_flags"]

    def test_env_vars_populated(self):
        # k8v4 preset declares layer_adaptive=7.
        result = select_preset(available_gb=20, model_size_gb=10)
        assert "TURBO_LAYER_ADAPTIVE" in result.env_vars
        assert result.env_vars["TURBO_LAYER_ADAPTIVE"] == "7"

    def test_server_flags_populated(self):
        # Both K and V cache type flags must be emitted.
        result = select_preset(available_gb=20, model_size_gb=10)
        assert "-ctk" in result.server_flags
        assert "-ctv" in result.server_flags
|
||||
|
||||
|
||||
# ── Preset Definitions ────────────────────────────────────────────────────────
|
||||
|
||||
class TestPresets:
    """Test preset definitions (schema completeness and ordering invariants)."""

    def test_all_presets_have_required_fields(self):
        # Every preset must carry the full schema used by _make_result/CLI.
        for name, preset in PRESETS.items():
            assert "name" in preset, f"{name} missing 'name'"
            assert "description" in preset, f"{name} missing 'description'"
            assert "min_overhead_gb" in preset, f"{name} missing 'min_overhead_gb'"
            assert "compression_ratio" in preset, f"{name} missing 'compression_ratio'"
            assert "quality" in preset, f"{name} missing 'quality'"
            assert "vllm_compatible" in preset, f"{name} missing 'vllm_compatible'"
            assert "kv_type" in preset, f"{name} missing 'kv_type'"
            assert "layer_adaptive" in preset, f"{name} missing 'layer_adaptive'"

    def test_quality_order_matches_presets(self):
        for name in QUALITY_ORDER:
            assert name in PRESETS, f"{name} in QUALITY_ORDER but not in PRESETS"

    def test_all_presets_in_quality_order(self):
        for name in PRESETS:
            assert name in QUALITY_ORDER, f"{name} in PRESETS but not in QUALITY_ORDER"

    def test_compression_increases_with_aggressiveness(self):
        """More aggressive presets should have higher compression."""
        for i in range(len(QUALITY_ORDER) - 1):
            current = PRESETS[QUALITY_ORDER[i]]
            next_p = PRESETS[QUALITY_ORDER[i + 1]]
            # TurboQuant presets should increase in compression
            # q4_0 is a fallback and may break the pattern
            if QUALITY_ORDER[i + 1] != "q4_0":
                assert current["compression_ratio"] <= next_p["compression_ratio"], (
                    f"{QUALITY_ORDER[i]} ({current['compression_ratio']}x) should have "
                    f"<= compression than {QUALITY_ORDER[i+1]} ({next_p['compression_ratio']}x)"
                )

    def test_overhead_decreases_with_aggressiveness(self):
        """More aggressive presets should have lower overhead requirements."""
        for i in range(len(QUALITY_ORDER) - 1):
            current = PRESETS[QUALITY_ORDER[i]]
            next_p = PRESETS[QUALITY_ORDER[i + 1]]
            assert current["min_overhead_gb"] >= next_p["min_overhead_gb"], (
                f"{QUALITY_ORDER[i]} (overhead={current['min_overhead_gb']}GB) should have "
                f">= overhead than {QUALITY_ORDER[i+1]} (overhead={next_p['min_overhead_gb']}GB)"
            )
|
||||
|
||||
|
||||
# ── SystemInfo ────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestSystemInfo:
    """Test system detection.

    NOTE(review): these run real detection (sysctl / /proc / psutil) on the
    host — results vary by machine, so only sanity invariants are asserted.
    """

    def test_detect_returns_info(self):
        info = SystemInfo.detect()
        assert info.total_gb > 0
        assert info.available_gb > 0
        assert info.detection_method

    def test_available_less_than_total(self):
        info = SystemInfo.detect()
        assert info.available_gb <= info.total_gb
|
||||
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestConfig:
    """Test config loading and saving (round-trips through tmp_path)."""

    def test_load_config_missing_file(self, tmp_path):
        # Missing file is not an error: returns an empty config.
        config = _load_config(str(tmp_path / "nonexistent.json"))
        assert config == {}

    def test_load_config_valid_file(self, tmp_path):
        config_file = tmp_path / "turboquant.json"
        config_file.write_text(json.dumps({"preset_override": "turboquant_4bit_nc"}))
        config = _load_config(str(config_file))
        assert config["preset_override"] == "turboquant_4bit_nc"

    def test_save_and_load_config(self, tmp_path):
        config_file = tmp_path / "turboquant.json"
        save_config({"preset_override": "turboquant_k8v4", "context_length": 32768},
                    str(config_file))
        config = _load_config(str(config_file))
        assert config["preset_override"] == "turboquant_k8v4"
        assert config["context_length"] == 32768

    def test_save_config_merges(self, tmp_path):
        # save_config must merge into an existing file, not overwrite it.
        config_file = tmp_path / "turboquant.json"
        config_file.write_text(json.dumps({"existing_key": "value"}))
        save_config({"new_key": "new_value"}, str(config_file))
        config = _load_config(str(config_file))
        assert config["existing_key"] == "value"
        assert config["new_key"] == "new_value"
|
||||
|
||||
|
||||
# ── Auto-Select ───────────────────────────────────────────────────────────────
|
||||
|
||||
class TestAutoSelect:
    """Test auto_select with mocked system detection.

    Every call passes a config_path inside tmp_path so results cannot be
    influenced by a real ~/.hermes/turboquant.json on the developer's
    machine (the previous version of some tests read the user's config).
    """

    def test_auto_select_with_override(self, tmp_path):
        """An explicit config_override wins and skips detection."""
        result = auto_select(
            model_size_gb=14.0,
            config_override="turboquant_4bit_nc",
            config_path=str(tmp_path / "turboquant.json"),
        )
        assert result.preset == "turboquant_4bit_nc"
        assert "Config override" in result.reason

    def test_auto_select_unknown_override_ignored(self, tmp_path):
        """An unknown override name falls back to normal detection."""
        result = auto_select(
            model_size_gb=14.0,
            config_override="nonexistent_preset",
            config_path=str(tmp_path / "turboquant.json"),
        )
        # Should fall back to normal detection
        assert result.preset in QUALITY_ORDER

    def test_auto_select_config_override(self, tmp_path):
        """Config file preset_override is respected."""
        config_file = tmp_path / "turboquant.json"
        config_file.write_text(json.dumps({"preset_override": "turboquant_3bit_nc"}))
        result = auto_select(model_size_gb=14.0, config_path=str(config_file))
        assert result.preset == "turboquant_3bit_nc"

    def test_auto_select_detects_system(self, tmp_path):
        """Mocked detection flows into SelectionResult.system_info."""
        with patch("turboquant.auto_select.SystemInfo.detect") as mock_detect:
            mock_detect.return_value = SystemInfo(
                total_gb=32, available_gb=24, is_apple_silicon=True,
                detection_method="mock"
            )
            result = auto_select(
                model_size_gb=14.0,
                config_path=str(tmp_path / "turboquant.json"),
            )
            assert result.system_info is not None
            assert result.system_info.total_gb == 32
            assert result.preset == "turboquant_k8v4"  # 24 - 14 = 10 GB overhead

    def test_auto_select_tight_memory(self, tmp_path):
        """Tight headroom drops to the 3-bit preset."""
        with patch("turboquant.auto_select.SystemInfo.detect") as mock_detect:
            mock_detect.return_value = SystemInfo(
                total_gb=16, available_gb=12, detection_method="mock"
            )
            result = auto_select(
                model_size_gb=10.0,
                config_path=str(tmp_path / "turboquant.json"),
            )
            # 12 - 10 = 2 GB overhead -> turboquant_3bit_nc (min 2 GB)
            assert result.preset == "turboquant_3bit_nc"
|
||||
|
||||
|
||||
# ── Format ────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestFormat:
    """Test formatting utilities (shell-export rendering)."""

    def test_format_env_commands(self):
        result = select_preset(available_gb=20, model_size_gb=10)
        output = format_env_commands(result)
        assert "export TURBO_LAYER_ADAPTIVE=7" in output
        assert "turboquant_k8v4" in output

    def test_format_env_commands_with_warnings(self):
        # 1 GB overhead triggers the low-headroom warning path.
        result = select_preset(available_gb=5, model_size_gb=4)
        output = format_env_commands(result)
        assert "WARNING" in output
|
||||
|
||||
|
||||
# ── Integration with issue spec ───────────────────────────────────────────────
|
||||
|
||||
class TestIssueSpec:
    """Verify implementation matches the exact logic from issue #97."""

    def test_overhead_8gb_plus(self):
        """overhead >= 8 -> turboquant_k8v4"""
        result = select_preset(available_gb=15, model_size_gb=7)  # 8 GB overhead
        assert result.preset == "turboquant_k8v4"

    def test_overhead_4_to_8(self):
        """4 <= overhead < 8 -> turboquant_4bit_nc"""
        result = select_preset(available_gb=11, model_size_gb=7)  # 4 GB overhead
        assert result.preset == "turboquant_4bit_nc"

    def test_overhead_2_to_4(self):
        """2 <= overhead < 4 -> turboquant_3bit_nc"""
        result = select_preset(available_gb=9, model_size_gb=7)  # 2 GB overhead
        assert result.preset == "turboquant_3bit_nc"

    def test_overhead_under_2(self):
        """overhead < 2 -> q4_0"""
        result = select_preset(available_gb=8, model_size_gb=7)  # 1 GB overhead
        assert result.preset == "q4_0"
|
||||
21
tests/test_hardware_optimizer.py
Normal file
21
tests/test_hardware_optimizer.py
Normal file
@@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for hardware_optimizer compatibility shim."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
|
||||
from evolution import hardware_optimizer, quant_selector
|
||||
|
||||
|
||||
def test_hardware_optimizer_reexports_quant_selector_api():
    """The shim must re-export the same objects (identity, not copies)."""
    assert hardware_optimizer.select_quant_level is quant_selector.select_quant_level
    assert hardware_optimizer.detect_hardware is quant_selector.detect_hardware
    assert hardware_optimizer.HardwareInfo is quant_selector.HardwareInfo
    assert hardware_optimizer.QuantSelection is quant_selector.QuantSelection
|
||||
|
||||
|
||||
def test_hardware_optimizer_exports_quant_level_definitions():
    """Quant-level table and enum are shared objects, not duplicated."""
    assert hardware_optimizer.QUANT_LEVELS is quant_selector.QUANT_LEVELS
    assert hardware_optimizer.QuantLevel is quant_selector.QuantLevel
|
||||
@@ -1,23 +0,0 @@
|
||||
"""TurboQuant — Auto-configuration and preset selection for KV cache compression."""
|
||||
|
||||
from turboquant.auto_select import (
|
||||
auto_select,
|
||||
select_preset,
|
||||
list_presets,
|
||||
get_preset_info,
|
||||
SystemInfo,
|
||||
SelectionResult,
|
||||
PRESETS,
|
||||
QUALITY_ORDER,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"auto_select",
|
||||
"select_preset",
|
||||
"list_presets",
|
||||
"get_preset_info",
|
||||
"SystemInfo",
|
||||
"SelectionResult",
|
||||
"PRESETS",
|
||||
"QUALITY_ORDER",
|
||||
]
|
||||
@@ -1,557 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TurboQuant Auto-Select — Choose optimal KV cache preset based on available memory.
|
||||
|
||||
Implements the config helper from issue #97: detects available memory and
|
||||
selects the best TurboQuant preset for the deployment environment.
|
||||
|
||||
Presets map to deployment configurations (not raw quant levels):
|
||||
turboquant_k8v4 — 8+ GB overhead, best quality (k/v asymmetric: 8-bit K, 4-bit V)
|
||||
turboquant_4bit_nc — 4-8 GB overhead, good quality (4-bit, no calibration)
|
||||
turboquant_3bit_nc — 2-4 GB overhead, usable quality (3-bit, no calibration)
|
||||
q4_0 — <2 GB overhead, GGUF fallback (no vLLM)
|
||||
|
||||
Usage:
|
||||
from turboquant.auto_select import auto_select
|
||||
|
||||
result = auto_select(model_size_gb=7.0)
|
||||
print(f"Selected: {result.preset}")
|
||||
|
||||
CLI:
|
||||
python3 turboquant/auto_select.py --model-size 7.0
|
||||
python3 turboquant/auto_select.py --model-size 7.0 --json
|
||||
python3 turboquant/auto_select.py --list
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── Preset Definitions ────────────────────────────────────────────────────────
|
||||
|
||||
# Deployment presets keyed by name. Fields:
#   min_overhead_gb   — memory headroom (available - model) required to pick it
#   compression_ratio — advertised KV-cache compression factor
#   kv_type           — llama-server -ctk/-ctv cache type value
#   layer_adaptive    — value exported as TURBO_LAYER_ADAPTIVE
PRESETS: Dict[str, Dict[str, Any]] = {
    "turboquant_k8v4": {
        "name": "TurboQuant K8V4",
        "description": "Best quality — asymmetric 8-bit K, 4-bit V",
        "min_overhead_gb": 8,
        "compression_ratio": 2.6,
        "quality": "best",
        "vllm_compatible": True,
        "kv_type": "turbo4",
        "layer_adaptive": 7,
    },
    "turboquant_4bit_nc": {
        "name": "TurboQuant 4-bit NC",
        "description": "Good quality — 4-bit, no calibration",
        "min_overhead_gb": 4,
        "compression_ratio": 3.8,
        "quality": "good",
        "vllm_compatible": True,
        "kv_type": "turbo4",
        "layer_adaptive": 5,
    },
    "turboquant_3bit_nc": {
        "name": "TurboQuant 3-bit NC",
        "description": "Usable quality — 3-bit, no calibration",
        "min_overhead_gb": 2,
        "compression_ratio": 4.9,
        "quality": "usable",
        "vllm_compatible": True,
        "kv_type": "turbo3",
        "layer_adaptive": 3,
    },
    # Final fallback: min_overhead_gb=0 means the selection loop always
    # terminates here when nothing better fits (unless require_vllm skips it).
    "q4_0": {
        "name": "Q4_0 GGUF",
        "description": "GGUF fallback — no TurboQuant, no vLLM",
        "min_overhead_gb": 0,
        "compression_ratio": 4.0,
        "quality": "basic",
        "vllm_compatible": False,
        "kv_type": "q4_0",
        "layer_adaptive": 0,
    },
}

# Ordered from best quality to most aggressive; select_preset scans this
# front-to-back and picks the first preset whose overhead floor is met.
QUALITY_ORDER = ["turboquant_k8v4", "turboquant_4bit_nc", "turboquant_3bit_nc", "q4_0"]
|
||||
|
||||
|
||||
# ── Data Classes ──────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class SystemInfo:
    """Detected system resources (memory figures in GiB)."""

    # Total physical (or unified) memory.
    total_gb: float
    # Memory currently usable for model weights + KV cache.
    available_gb: float
    # Dedicated GPU memory; equals total_gb on unified-memory Apple Silicon.
    gpu_memory_gb: Optional[float] = None
    gpu_name: Optional[str] = None
    is_apple_silicon: bool = False
    # How the numbers were obtained: "sysctl", "proc", "nvidia-smi",
    # "psutil", or "fallback" (hard-coded 16/12 GB guess).
    detection_method: str = ""

    @classmethod
    def detect(cls) -> "SystemInfo":
        """Detect available memory. Uses platform-appropriate methods."""
        system = platform.system()

        if system == "Darwin":
            return cls._detect_macos()
        elif system == "Linux":
            return cls._detect_linux()
        else:
            return cls._detect_fallback(system)

    @staticmethod
    def _parse_vm_stat(text: str) -> tuple:
        """Parse `vm_stat` output into (page_size_bytes, free_pages).

        FIX: the previous implementation hard-coded page_size = 4096, but the
        vm_stat header advertises the real page size ("... (page size of
        16384 bytes)") and Apple Silicon uses 16 KiB pages — so free memory
        was undercounted by 4x. We now read the header and fall back to 4096
        only when it is missing or malformed.
        """
        page_size = 4096  # fallback for unexpected output
        free_pages = 0
        header = text.split("\n", 1)[0]
        if "page size of" in header:
            try:
                page_size = int(header.split("page size of", 1)[1].split()[0])
            except (ValueError, IndexError):
                pass
        for line in text.split("\n"):
            if "Pages free:" in line:
                try:
                    # Values are formatted like "Pages free:  123456."
                    free_pages = int(line.split(":")[-1].strip().rstrip("."))
                except ValueError:
                    pass
        return page_size, free_pages

    @classmethod
    def _detect_macos(cls) -> "SystemInfo":
        """Detect Apple Silicon unified memory via sysctl / vm_stat."""
        info = cls(total_gb=0, available_gb=0, is_apple_silicon=True, detection_method="sysctl")
        try:
            r = subprocess.run(["sysctl", "-n", "hw.memsize"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                info.total_gb = int(r.stdout.strip()) / (1024 ** 3)

            r = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                page_size, free_pages = cls._parse_vm_stat(r.stdout)
                info.available_gb = (free_pages * page_size) / (1024 ** 3)

            # "Pages free" alone badly underestimates usable memory on macOS
            # (file cache / purgeable pages aren't counted), so treat < 1 GB
            # as implausible and assume ~70% of total instead.
            if info.available_gb < 1:
                info.available_gb = info.total_gb * 0.70

            info.gpu_memory_gb = info.total_gb  # Unified memory

            r = subprocess.run(
                ["system_profiler", "SPDisplaysDataType"],
                capture_output=True, text=True, timeout=10
            )
            if r.returncode == 0:
                for line in r.stdout.split("\n"):
                    if "Chipset" in line or "GPU" in line:
                        info.gpu_name = line.split(":")[-1].strip()
                        break
        except Exception as e:
            logger.warning("macOS detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"

        return info

    @classmethod
    def _detect_linux(cls) -> "SystemInfo":
        """Detect Linux memory via /proc/meminfo, with optional NVIDIA GPU."""
        info = cls(total_gb=0, available_gb=0, detection_method="proc")
        try:
            with open("/proc/meminfo") as f:
                for line in f:
                    # meminfo values are in KiB.
                    if line.startswith("MemTotal:"):
                        info.total_gb = int(line.split()[1]) / (1024 * 1024)
                    elif line.startswith("MemAvailable:"):
                        info.available_gb = int(line.split()[1]) / (1024 * 1024)

            # nvidia-smi is best-effort: absence or timeout is not an error.
            try:
                r = subprocess.run(
                    ["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
                     "--format=csv,noheader,nounits"],
                    capture_output=True, text=True, timeout=10
                )
                if r.returncode == 0 and r.stdout.strip():
                    parts = r.stdout.strip().split("\n")[0].split(", ")
                    if len(parts) >= 3:
                        info.gpu_name = parts[0].strip()
                        info.gpu_memory_gb = float(parts[1]) / 1024  # MiB -> GiB
                        # Prefer GPU free memory if larger than host available.
                        info.available_gb = max(info.available_gb, float(parts[2]) / 1024)
                        info.detection_method = "nvidia-smi"
            except (FileNotFoundError, subprocess.TimeoutExpired):
                pass
        except Exception as e:
            logger.warning("Linux detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"

        return info

    @classmethod
    def _detect_fallback(cls, system: str) -> "SystemInfo":
        """Fallback for unknown systems. Tries psutil, else a 16/12 GB guess."""
        try:
            import psutil
            mem = psutil.virtual_memory()
            return cls(
                total_gb=round(mem.total / (1024 ** 3), 1),
                available_gb=round(mem.available / (1024 ** 3), 1),
                detection_method="psutil",
            )
        except ImportError:
            return cls(total_gb=16.0, available_gb=12.0, detection_method="fallback")
|
||||
|
||||
|
||||
@dataclass
class SelectionResult:
    """Result of preset selection: chosen preset plus launch configuration."""
    # Preset key into PRESETS (e.g. "turboquant_k8v4").
    preset: str
    # Human-readable explanation of why this preset was chosen.
    reason: str
    # available_gb - model_size_gb at selection time (0 for config overrides).
    overhead_gb: float
    quality: str
    compression_ratio: float
    vllm_compatible: bool
    # llama-server cache type (-ctk/-ctv value).
    kv_type: str
    layer_adaptive: int
    # Environment variables to export (e.g. TURBO_LAYER_ADAPTIVE).
    env_vars: Dict[str, str] = field(default_factory=dict)
    # llama-server CLI flags keyed by flag name.
    server_flags: Dict[str, str] = field(default_factory=dict)
    warnings: list = field(default_factory=list)
    # Populated by auto_select(); None when select_preset() is called directly.
    system_info: Optional[SystemInfo] = None

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; includes "system" only when
        system_info was attached."""
        d = {
            "preset": self.preset,
            "reason": self.reason,
            "overhead_gb": round(self.overhead_gb, 2),
            "quality": self.quality,
            "compression_ratio": self.compression_ratio,
            "vllm_compatible": self.vllm_compatible,
            "kv_type": self.kv_type,
            "layer_adaptive": self.layer_adaptive,
            "env_vars": self.env_vars,
            "server_flags": self.server_flags,
            "warnings": self.warnings,
        }
        if self.system_info:
            d["system"] = {
                "total_gb": round(self.system_info.total_gb, 1),
                "available_gb": round(self.system_info.available_gb, 1),
                "gpu_name": self.system_info.gpu_name,
                "is_apple_silicon": self.system_info.is_apple_silicon,
                "detection_method": self.system_info.detection_method,
            }
        return d
|
||||
|
||||
|
||||
# ── Selection Logic ───────────────────────────────────────────────────────────
|
||||
|
||||
def select_preset(
    available_gb: float,
    model_size_gb: float,
    require_vllm: bool = False,
) -> SelectionResult:
    """Pick the best TurboQuant preset for the given memory budget.

    Args:
        available_gb: Available system memory in GB
        model_size_gb: Model size in GB
        require_vllm: If True, only select vLLM-compatible presets

    Returns:
        SelectionResult with chosen preset and reasoning
    """
    headroom = available_gb - model_size_gb

    # Model doesn't even fit: warn and bail straight to the GGUF fallback.
    if headroom < 0:
        logger.warning(
            "Insufficient memory: need %.1f GB, have %.1f GB available",
            model_size_gb, available_gb
        )
        return _make_result(
            "q4_0", headroom,
            reason=f"Insufficient memory ({headroom:.1f} GB deficit), using GGUF fallback",
        )

    # Scan best-quality-first; the first preset whose overhead floor we meet wins.
    candidates = (
        (name, PRESETS[name])
        for name in QUALITY_ORDER
        if not require_vllm or PRESETS[name]["vllm_compatible"]
    )
    for name, spec in candidates:
        floor = spec["min_overhead_gb"]
        if headroom >= floor:
            why = (
                f"Overhead {headroom:.1f} GB >= {floor} GB "
                f"required for {spec['name']}"
            )
            return _make_result(name, headroom, reason=why)

    # Only reachable when require_vllm filtered out the q4_0 floor (0 GB).
    return _make_result(
        "q4_0", headroom,
        reason=f"Overhead {headroom:.1f} GB too low for TurboQuant, using GGUF fallback",
    )
|
||||
|
||||
|
||||
def _make_result(preset_name: str, overhead_gb: float, reason: str) -> SelectionResult:
    """Assemble a SelectionResult (env vars, server flags, warnings) for a preset."""
    spec = PRESETS[preset_name]
    kv = spec["kv_type"]
    adaptive = spec["layer_adaptive"]

    # Low-but-nonnegative headroom and overcommit are mutually exclusive,
    # so an if/elif reproduces the original pair of checks exactly.
    notes = []
    if 0 <= overhead_gb < 2.0:
        notes.append(f"Low headroom ({overhead_gb:.1f} GB). Consider reducing context length.")
    elif overhead_gb < 0:
        notes.append(
            f"OVERCOMMITTED: model needs {abs(overhead_gb):.1f} GB more than available. "
            f"Inference may fail or swap heavily."
        )

    return SelectionResult(
        preset=preset_name,
        reason=reason,
        overhead_gb=overhead_gb,
        quality=spec["quality"],
        compression_ratio=spec["compression_ratio"],
        vllm_compatible=spec["vllm_compatible"],
        kv_type=kv,
        layer_adaptive=adaptive,
        env_vars={"TURBO_LAYER_ADAPTIVE": str(adaptive)},
        server_flags={"-ctk": kv, "-ctv": kv},
        warnings=notes,
    )
|
||||
|
||||
|
||||
# ── Auto-Select (with system detection + config) ─────────────────────────────
|
||||
|
||||
def auto_select(
    model_size_gb: float,
    config_override: Optional[str] = None,
    require_vllm: bool = False,
    config_path: Optional[str] = None,
) -> SelectionResult:
    """Auto-select preset based on system detection.

    Args:
        model_size_gb: Model size in GB
        config_override: Force a specific preset (skips detection)
        require_vllm: Require vLLM compatibility
        config_path: Path to config file (default: $HERMES_HOME/turboquant.json)

    Returns:
        SelectionResult
    """
    cfg = _load_config(config_path)

    # Explicit argument beats the config file's preset_override.
    forced = config_override or cfg.get("preset_override")
    if forced and forced in PRESETS:
        logger.info("Using config override: %s", forced)
        spec = PRESETS[forced]
        kv = spec["kv_type"]
        adaptive = spec["layer_adaptive"]
        return SelectionResult(
            preset=forced,
            reason=f"Config override: {spec['name']}",
            overhead_gb=0,
            quality=spec["quality"],
            compression_ratio=spec["compression_ratio"],
            vllm_compatible=spec["vllm_compatible"],
            kv_type=kv,
            layer_adaptive=adaptive,
            env_vars={"TURBO_LAYER_ADAPTIVE": str(adaptive)},
            server_flags={"-ctk": kv, "-ctv": kv},
        )
    if forced:
        # Unknown override names are ignored, not fatal.
        logger.warning("Unknown preset override '%s', falling back to auto-select", forced)

    sys_info = SystemInfo.detect()
    logger.info(
        "System: %.1f GB total, %.1f GB available, model: %.1f GB",
        sys_info.total_gb, sys_info.available_gb, model_size_gb
    )

    result = select_preset(
        available_gb=sys_info.available_gb,
        model_size_gb=model_size_gb,
        require_vllm=require_vllm,
    )
    result.system_info = sys_info

    # Honor a configured context length via the llama-server -c flag.
    ctx = cfg.get("context_length")
    if ctx:
        result.server_flags["-c"] = str(ctx)

    return result
|
||||
|
||||
|
||||
def _load_config(config_path: Optional[str] = None) -> dict:
|
||||
"""Load turboquant config from JSON file."""
|
||||
if config_path:
|
||||
p = Path(config_path)
|
||||
else:
|
||||
try:
|
||||
from hermes_constants import get_hermes_home
|
||||
p = get_hermes_home() / "turboquant.json"
|
||||
except (ImportError, Exception):
|
||||
p = Path.home() / ".hermes" / "turboquant.json"
|
||||
|
||||
if p.exists():
|
||||
try:
|
||||
return json.loads(p.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def save_config(values: dict, config_path: Optional[str] = None) -> None:
    """Save turboquant config to a JSON file, merging with existing content.

    Existing keys not present in `values` are preserved; keys in `values`
    overwrite existing ones. Parent directories are created as needed.

    Args:
        values: Key/value pairs to merge into the config file.
        config_path: Explicit config file path; defaults to the same
            resolution as _load_config.
    """
    if config_path:
        p = Path(config_path)
    else:
        # Same optional-dependency fallback as _load_config. (Fixed: was
        # `except (ImportError, Exception)` — ImportError is redundant
        # because Exception already covers it.)
        try:
            from hermes_constants import get_hermes_home
            p = get_hermes_home() / "turboquant.json"
        except Exception:
            p = Path.home() / ".hermes" / "turboquant.json"

    p.parent.mkdir(parents=True, exist_ok=True)

    # Merge rather than overwrite so unrelated settings survive a save.
    existing = {}
    if p.exists():
        try:
            existing = json.loads(p.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            # Corrupt file: start fresh rather than fail the save.
            pass

    existing.update(values)
    p.write_text(json.dumps(existing, indent=2), encoding="utf-8")
    logger.info("Config saved to %s", p)
|
||||
|
||||
|
||||
# ── Utility ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_preset_info(preset_name: str) -> Optional[dict]:
    """Return the preset definition for `preset_name`, or None if unknown."""
    try:
        return PRESETS[preset_name]
    except KeyError:
        return None
|
||||
|
||||
|
||||
def list_presets() -> dict:
    """Return a shallow copy of all preset definitions (safe to mutate)."""
    return dict(PRESETS)
|
||||
|
||||
|
||||
def format_env_commands(result: SelectionResult) -> str:
    """Render a SelectionResult as shell `export` commands with comments."""
    # Header: preset summary and the selection reasoning, as comments.
    out = [
        f"# TurboQuant preset: {result.preset} ({result.quality}, {result.compression_ratio}x)",
        f"# {result.reason}",
    ]
    # One export per environment variable.
    out.extend(f"export {name}={value}" for name, value in result.env_vars.items())
    # Server flags can't be exported — surface them as a comment instead.
    flag_text = " ".join(f"{name} {value}" for name, value in result.server_flags.items())
    if flag_text:
        out.append(f"# llama-server flags: {flag_text}")
    out.extend(f"# WARNING: {note}" for note in result.warnings)
    return "\n".join(out)
|
||||
|
||||
|
||||
# ── CLI ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: parse args, then dispatch to list / detect / select."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Auto-select TurboQuant preset based on available memory",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --model-size 14.0              # Auto-detect and select
  %(prog)s --model-size 7.0 --json        # JSON output
  %(prog)s --model-size 7.0 --list        # List available presets
  %(prog)s --model-size 7.0 --require-vllm
  %(prog)s --model-size 7.0 --preset turboquant_4bit_nc  # Override
""",
    )
    parser.add_argument("--model-size", type=float, help="Model size in GB")
    parser.add_argument("--preset", help="Force a specific preset (skip detection)")
    parser.add_argument("--require-vllm", action="store_true", help="Require vLLM compatibility")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--list", action="store_true", help="List available presets")
    parser.add_argument("--shell", action="store_true", help="Output as shell export commands")
    parser.add_argument("--detect-only", action="store_true", help="Only detect hardware")
    parser.add_argument("--config", help="Path to config file")

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # --list: print the preset table and exit (no model size needed).
    if args.list:
        print("Available presets:")
        print(f"  {'Name':22} {'Quality':8} {'Compress':8} {'vLLM':5} {'Overhead':8} Description")
        for name in QUALITY_ORDER:
            p = PRESETS[name]
            vllm = "yes" if p["vllm_compatible"] else "no"
            print(
                f"  {name:22} {p['quality']:8} {p['compression_ratio']:.1f}x     "
                f"{vllm:5} >= {p['min_overhead_gb']:2.0f} GB  {p['description']}"
            )
        return

    # --detect-only: report hardware detection results and exit.
    if args.detect_only:
        info = SystemInfo.detect()
        if args.json:
            print(json.dumps({
                "total_gb": round(info.total_gb, 1),
                "available_gb": round(info.available_gb, 1),
                "gpu_memory_gb": round(info.gpu_memory_gb, 1) if info.gpu_memory_gb else None,
                "gpu_name": info.gpu_name,
                "is_apple_silicon": info.is_apple_silicon,
                "detection_method": info.detection_method,
            }, indent=2))
        else:
            print(f"Total memory: {info.total_gb:.1f} GB")
            print(f"Available: {info.available_gb:.1f} GB")
            if info.gpu_memory_gb:
                print(f"GPU memory: {info.gpu_memory_gb:.1f} GB")
            if info.gpu_name:
                print(f"GPU: {info.gpu_name}")
            if info.is_apple_silicon:
                print(f"Chip: Apple Silicon")
            print(f"Detection: {info.detection_method}")
        return

    # NOTE(review): truthiness check means `--model-size 0` is rejected as
    # missing; `args.model_size is None` would distinguish 0 from absent —
    # confirm whether a zero model size should be accepted here.
    if not args.model_size:
        parser.error("--model-size is required (unless using --list or --detect-only)")

    result = auto_select(
        model_size_gb=args.model_size,
        config_override=args.preset,
        require_vllm=args.require_vllm,
        config_path=args.config,
    )

    # Three output modes: JSON, shell exports, or human-readable summary.
    if args.json:
        print(json.dumps(result.to_dict(), indent=2))
    elif args.shell:
        print(format_env_commands(result))
    else:
        print(f"Selected: {result.preset} ({result.quality}, {result.compression_ratio}x)")
        print(f"  {result.reason}")
        if result.system_info:
            print(f"  System: {result.system_info.total_gb:.0f} GB total, {result.system_info.available_gb:.0f} GB available")
        print(f"  Overhead: {result.overhead_gb:.1f} GB")
        print()
        print(f"Environment:")
        for k, v in result.env_vars.items():
            print(f"  export {k}={v}")
        print(f"Server flags:")
        for k, v in result.server_flags.items():
            print(f"  {k} {v}")
        if result.warnings:
            print()
            for w in result.warnings:
                print(f"  WARNING: {w}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user