#!/usr/bin/env python3
"""Tests for turboquant/auto_select.py — preset selection logic."""

import json
import os
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch, MagicMock

import pytest

sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from turboquant.auto_select import (
    select_preset,
    auto_select,
    PRESETS,
    QUALITY_ORDER,
    SelectionResult,
    SystemInfo,
    _load_config,
    save_config,
    format_env_commands,
)


# ── Preset Selection ──────────────────────────────────────────────────────────

class TestSelectPreset:
    """Selection tiers: overhead (available minus model size) picks the preset."""

    def test_high_overhead_best_quality(self):
        """8+ GB overhead -> turboquant_k8v4 (best)."""
        chosen = select_preset(available_gb=20, model_size_gb=10)
        assert chosen.preset == "turboquant_k8v4"
        assert chosen.quality == "best"
        assert chosen.compression_ratio == 2.6
        assert chosen.vllm_compatible is True

    def test_medium_overhead_good_quality(self):
        """4-8 GB overhead -> turboquant_4bit_nc (good)."""
        chosen = select_preset(available_gb=12, model_size_gb=6)
        assert chosen.preset == "turboquant_4bit_nc"
        assert chosen.quality == "good"
        assert chosen.compression_ratio == 3.8

    def test_low_overhead_usable_quality(self):
        """2-4 GB overhead -> turboquant_3bit_nc (usable)."""
        chosen = select_preset(available_gb=8, model_size_gb=5)
        assert chosen.preset == "turboquant_3bit_nc"
        assert chosen.quality == "usable"
        assert chosen.compression_ratio == 4.9

    def test_minimal_overhead_fallback(self):
        """<2 GB overhead -> q4_0 (basic fallback)."""
        chosen = select_preset(available_gb=5, model_size_gb=4)
        assert chosen.preset == "q4_0"
        assert chosen.quality == "basic"
        assert chosen.vllm_compatible is False

    def test_negative_overhead_fallback(self):
        """Negative overhead (not enough memory) -> q4_0 plus a warning."""
        chosen = select_preset(available_gb=3, model_size_gb=10)
        assert chosen.preset == "q4_0"
        assert chosen.overhead_gb < 0
        assert len(chosen.warnings) > 0
        assert "OVERCOMMITTED" in chosen.warnings[0]

    def test_exact_threshold(self):
        """Exactly at threshold selects that preset."""
        chosen = select_preset(available_gb=12, model_size_gb=4)  # 8 GB overhead
        assert chosen.preset == "turboquant_k8v4"

    def test_just_below_threshold(self):
        """Just below threshold selects next tier."""
        chosen = select_preset(available_gb=11.9, model_size_gb=4)  # 7.9 GB overhead
        assert chosen.preset == "turboquant_4bit_nc"

    def test_zero_model_size(self):
        """Zero model size -> all overhead, best preset."""
        chosen = select_preset(available_gb=16, model_size_gb=0)
        assert chosen.preset == "turboquant_k8v4"


# ── vLLM Requirement ─────────────────────────────────────────────────────────

class TestVLLMRequirement:
    """Filtering on vLLM compatibility."""

    def test_require_vllm_skips_fallback(self):
        """require_vllm should skip q4_0 (not vLLM compatible)."""
        # Very low overhead — without the vLLM requirement this would pick q4_0.
        # q4_0 is not vLLM compatible; with 1 GB overhead nothing else fits,
        # so selection falls through to q4_0 as the final fallback.
        chosen = select_preset(available_gb=5, model_size_gb=4, require_vllm=True)
        assert chosen.preset in QUALITY_ORDER

    def test_require_vllm_high_overhead(self):
        """With high overhead, a vLLM-compatible preset is selected."""
        chosen = select_preset(available_gb=20, model_size_gb=10, require_vllm=True)
        assert chosen.vllm_compatible is True
"turboquant_k8v4" + assert d["quality"] == "best" + assert "env_vars" in d + assert "server_flags" in d + assert "TURBO_LAYER_ADAPTIVE" in d["env_vars"] + assert "-ctk" in d["server_flags"] + + def test_env_vars_populated(self): + result = select_preset(available_gb=20, model_size_gb=10) + assert "TURBO_LAYER_ADAPTIVE" in result.env_vars + assert result.env_vars["TURBO_LAYER_ADAPTIVE"] == "7" + + def test_server_flags_populated(self): + result = select_preset(available_gb=20, model_size_gb=10) + assert "-ctk" in result.server_flags + assert "-ctv" in result.server_flags + + +# ── Preset Definitions ──────────────────────────────────────────────────────── + +class TestPresets: + """Test preset definitions.""" + + def test_all_presets_have_required_fields(self): + for name, preset in PRESETS.items(): + assert "name" in preset, f"{name} missing 'name'" + assert "description" in preset, f"{name} missing 'description'" + assert "min_overhead_gb" in preset, f"{name} missing 'min_overhead_gb'" + assert "compression_ratio" in preset, f"{name} missing 'compression_ratio'" + assert "quality" in preset, f"{name} missing 'quality'" + assert "vllm_compatible" in preset, f"{name} missing 'vllm_compatible'" + assert "kv_type" in preset, f"{name} missing 'kv_type'" + assert "layer_adaptive" in preset, f"{name} missing 'layer_adaptive'" + + def test_quality_order_matches_presets(self): + for name in QUALITY_ORDER: + assert name in PRESETS, f"{name} in QUALITY_ORDER but not in PRESETS" + + def test_all_presets_in_quality_order(self): + for name in PRESETS: + assert name in QUALITY_ORDER, f"{name} in PRESETS but not in QUALITY_ORDER" + + def test_compression_increases_with_aggressiveness(self): + """More aggressive presets should have higher compression.""" + for i in range(len(QUALITY_ORDER) - 1): + current = PRESETS[QUALITY_ORDER[i]] + next_p = PRESETS[QUALITY_ORDER[i + 1]] + # TurboQuant presets should increase in compression + # q4_0 is a fallback and may break the pattern + if 
QUALITY_ORDER[i + 1] != "q4_0": + assert current["compression_ratio"] <= next_p["compression_ratio"], ( + f"{QUALITY_ORDER[i]} ({current['compression_ratio']}x) should have " + f"<= compression than {QUALITY_ORDER[i+1]} ({next_p['compression_ratio']}x)" + ) + + def test_overhead_decreases_with_aggressiveness(self): + """More aggressive presets should have lower overhead requirements.""" + for i in range(len(QUALITY_ORDER) - 1): + current = PRESETS[QUALITY_ORDER[i]] + next_p = PRESETS[QUALITY_ORDER[i + 1]] + assert current["min_overhead_gb"] >= next_p["min_overhead_gb"], ( + f"{QUALITY_ORDER[i]} (overhead={current['min_overhead_gb']}GB) should have " + f">= overhead than {QUALITY_ORDER[i+1]} (overhead={next_p['min_overhead_gb']}GB)" + ) + + +# ── SystemInfo ──────────────────────────────────────────────────────────────── + +class TestSystemInfo: + """Test system detection.""" + + def test_detect_returns_info(self): + info = SystemInfo.detect() + assert info.total_gb > 0 + assert info.available_gb > 0 + assert info.detection_method + + def test_available_less_than_total(self): + info = SystemInfo.detect() + assert info.available_gb <= info.total_gb + + +# ── Config ──────────────────────────────────────────────────────────────────── + +class TestConfig: + """Test config loading and saving.""" + + def test_load_config_missing_file(self, tmp_path): + config = _load_config(str(tmp_path / "nonexistent.json")) + assert config == {} + + def test_load_config_valid_file(self, tmp_path): + config_file = tmp_path / "turboquant.json" + config_file.write_text(json.dumps({"preset_override": "turboquant_4bit_nc"})) + config = _load_config(str(config_file)) + assert config["preset_override"] == "turboquant_4bit_nc" + + def test_save_and_load_config(self, tmp_path): + config_file = tmp_path / "turboquant.json" + save_config({"preset_override": "turboquant_k8v4", "context_length": 32768}, + str(config_file)) + config = _load_config(str(config_file)) + assert config["preset_override"] 
# ── Auto-Select ───────────────────────────────────────────────────────────────

class TestAutoSelect:
    """auto_select with mocked system detection."""

    def test_auto_select_with_override(self, tmp_path):
        res = auto_select(model_size_gb=14.0, config_override="turboquant_4bit_nc")
        assert res.preset == "turboquant_4bit_nc"
        assert "Config override" in res.reason

    def test_auto_select_unknown_override_ignored(self):
        # An unknown preset name should fall back to normal detection.
        res = auto_select(model_size_gb=14.0, config_override="nonexistent_preset")
        assert res.preset in QUALITY_ORDER

    def test_auto_select_config_override(self, tmp_path):
        """Config file preset_override is respected."""
        cfg_file = tmp_path / "turboquant.json"
        cfg_file.write_text(json.dumps({"preset_override": "turboquant_3bit_nc"}))
        res = auto_select(model_size_gb=14.0, config_path=str(cfg_file))
        assert res.preset == "turboquant_3bit_nc"

    def test_auto_select_detects_system(self):
        with patch("turboquant.auto_select.SystemInfo.detect") as fake_detect:
            fake_detect.return_value = SystemInfo(
                total_gb=32, available_gb=24, is_apple_silicon=True,
                detection_method="mock"
            )
            res = auto_select(model_size_gb=14.0)
        assert res.system_info is not None
        assert res.system_info.total_gb == 32
        assert res.preset == "turboquant_k8v4"  # 24 - 14 = 10 GB overhead

    def test_auto_select_tight_memory(self):
        with patch("turboquant.auto_select.SystemInfo.detect") as fake_detect:
            fake_detect.return_value = SystemInfo(
                total_gb=16, available_gb=12, detection_method="mock"
            )
            res = auto_select(model_size_gb=10.0)
        # 12 - 10 = 2 GB overhead -> turboquant_3bit_nc (min 2 GB)
        assert res.preset == "turboquant_3bit_nc"


# ── Format ────────────────────────────────────────────────────────────────────

class TestFormat:
    """Formatting utilities."""

    def test_format_env_commands(self):
        out = format_env_commands(select_preset(available_gb=20, model_size_gb=10))
        assert "export TURBO_LAYER_ADAPTIVE=7" in out
        assert "turboquant_k8v4" in out

    def test_format_env_commands_with_warnings(self):
        out = format_env_commands(select_preset(available_gb=5, model_size_gb=4))
        assert "WARNING" in out


# ── Integration with issue spec ───────────────────────────────────────────────

class TestIssueSpec:
    """Verify implementation matches the exact logic from issue #97."""

    def test_overhead_8gb_plus(self):
        """overhead >= 8 -> turboquant_k8v4"""
        assert select_preset(available_gb=15, model_size_gb=7).preset == "turboquant_k8v4"

    def test_overhead_4_to_8(self):
        """4 <= overhead < 8 -> turboquant_4bit_nc"""
        assert select_preset(available_gb=11, model_size_gb=7).preset == "turboquant_4bit_nc"

    def test_overhead_2_to_4(self):
        """2 <= overhead < 4 -> turboquant_3bit_nc"""
        assert select_preset(available_gb=9, model_size_gb=7).preset == "turboquant_3bit_nc"

    def test_overhead_under_2(self):
        """overhead < 2 -> q4_0"""
        assert select_preset(available_gb=8, model_size_gb=7).preset == "q4_0"
compression.""" + +from turboquant.auto_select import ( + auto_select, + select_preset, + list_presets, + get_preset_info, + SystemInfo, + SelectionResult, + PRESETS, + QUALITY_ORDER, +) + +__all__ = [ + "auto_select", + "select_preset", + "list_presets", + "get_preset_info", + "SystemInfo", + "SelectionResult", + "PRESETS", + "QUALITY_ORDER", +] diff --git a/turboquant/auto_select.py b/turboquant/auto_select.py new file mode 100644 index 00000000..88ae88aa --- /dev/null +++ b/turboquant/auto_select.py @@ -0,0 +1,557 @@ +#!/usr/bin/env python3 +""" +TurboQuant Auto-Select — Choose optimal KV cache preset based on available memory. + +Implements the config helper from issue #97: detects available memory and +selects the best TurboQuant preset for the deployment environment. + +Presets map to deployment configurations (not raw quant levels): + turboquant_k8v4 — 8+ GB overhead, best quality (k/v asymmetric: 8-bit K, 4-bit V) + turboquant_4bit_nc — 4-8 GB overhead, good quality (4-bit, no calibration) + turboquant_3bit_nc — 2-4 GB overhead, usable quality (3-bit, no calibration) + q4_0 — <2 GB overhead, GGUF fallback (no vLLM) + +Usage: + from turboquant.auto_select import auto_select + + result = auto_select(model_size_gb=7.0) + print(f"Selected: {result.preset}") + +CLI: + python3 turboquant/auto_select.py --model-size 7.0 + python3 turboquant/auto_select.py --model-size 7.0 --json + python3 turboquant/auto_select.py --list +""" + +import json +import logging +import os +import platform +import subprocess +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Optional + +logger = logging.getLogger(__name__) + + +# ── Preset Definitions ──────────────────────────────────────────────────────── + +PRESETS: Dict[str, Dict[str, Any]] = { + "turboquant_k8v4": { + "name": "TurboQuant K8V4", + "description": "Best quality — asymmetric 8-bit K, 4-bit V", + "min_overhead_gb": 8, + "compression_ratio": 2.6, + "quality": "best", + 
"vllm_compatible": True, + "kv_type": "turbo4", + "layer_adaptive": 7, + }, + "turboquant_4bit_nc": { + "name": "TurboQuant 4-bit NC", + "description": "Good quality — 4-bit, no calibration", + "min_overhead_gb": 4, + "compression_ratio": 3.8, + "quality": "good", + "vllm_compatible": True, + "kv_type": "turbo4", + "layer_adaptive": 5, + }, + "turboquant_3bit_nc": { + "name": "TurboQuant 3-bit NC", + "description": "Usable quality — 3-bit, no calibration", + "min_overhead_gb": 2, + "compression_ratio": 4.9, + "quality": "usable", + "vllm_compatible": True, + "kv_type": "turbo3", + "layer_adaptive": 3, + }, + "q4_0": { + "name": "Q4_0 GGUF", + "description": "GGUF fallback — no TurboQuant, no vLLM", + "min_overhead_gb": 0, + "compression_ratio": 4.0, + "quality": "basic", + "vllm_compatible": False, + "kv_type": "q4_0", + "layer_adaptive": 0, + }, +} + +# Ordered from best quality to most aggressive +QUALITY_ORDER = ["turboquant_k8v4", "turboquant_4bit_nc", "turboquant_3bit_nc", "q4_0"] + + +# ── Data Classes ────────────────────────────────────────────────────────────── + +@dataclass +class SystemInfo: + """Detected system resources.""" + total_gb: float + available_gb: float + gpu_memory_gb: Optional[float] = None + gpu_name: Optional[str] = None + is_apple_silicon: bool = False + detection_method: str = "" + + @classmethod + def detect(cls) -> "SystemInfo": + """Detect available memory. 
@dataclass
class SystemInfo:
    """Detected system resources (all sizes in GiB)."""
    total_gb: float                       # physical memory
    available_gb: float                   # memory usable for the model + KV cache
    gpu_memory_gb: Optional[float] = None
    gpu_name: Optional[str] = None
    is_apple_silicon: bool = False
    detection_method: str = ""            # how the numbers were obtained

    @classmethod
    def detect(cls) -> "SystemInfo":
        """Detect available memory. Uses platform-appropriate methods."""
        system = platform.system()

        if system == "Darwin":
            return cls._detect_macos()
        elif system == "Linux":
            return cls._detect_linux()
        else:
            return cls._detect_fallback(system)

    @classmethod
    def _detect_macos(cls) -> "SystemInfo":
        """Detect Apple Silicon unified memory via sysctl / vm_stat."""
        info = cls(total_gb=0, available_gb=0, is_apple_silicon=True, detection_method="sysctl")
        try:
            r = subprocess.run(["sysctl", "-n", "hw.memsize"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                info.total_gb = int(r.stdout.strip()) / (1024 ** 3)

            r = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                # BUGFIX: page size was hard-coded to 4096, but Apple Silicon
                # uses 16 KiB pages, so free memory was underestimated 4x.
                # vm_stat's header reports the real value:
                #   "Mach Virtual Memory Statistics: (page size of 16384 bytes)"
                page_size = 4096  # fallback if the header is missing
                free_pages = 0
                for line in r.stdout.split("\n"):
                    if "page size of" in line:
                        try:
                            page_size = int(line.split("page size of", 1)[1].split()[0])
                        except (ValueError, IndexError):
                            pass
                    elif "Pages free:" in line:
                        try:
                            free_pages = int(line.split(":")[-1].strip().rstrip("."))
                        except ValueError:
                            pass
                info.available_gb = (free_pages * page_size) / (1024 ** 3)

            if info.available_gb < 1:
                # Free pages alone under-report usable memory (macOS keeps
                # caches resident); assume 70% of total is usable instead.
                info.available_gb = info.total_gb * 0.70

            info.gpu_memory_gb = info.total_gb  # Unified memory

            r = subprocess.run(
                ["system_profiler", "SPDisplaysDataType"],
                capture_output=True, text=True, timeout=10
            )
            if r.returncode == 0:
                for line in r.stdout.split("\n"):
                    if "Chipset" in line or "GPU" in line:
                        info.gpu_name = line.split(":")[-1].strip()
                        break
        except Exception as e:
            logger.warning("macOS detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"

        return info

    @classmethod
    def _detect_linux(cls) -> "SystemInfo":
        """Detect Linux with optional NVIDIA GPU (nvidia-smi)."""
        info = cls(total_gb=0, available_gb=0, detection_method="proc")
        try:
            # /proc/meminfo reports in kiB.
            with open("/proc/meminfo") as f:
                for line in f:
                    if line.startswith("MemTotal:"):
                        info.total_gb = int(line.split()[1]) / (1024 * 1024)
                    elif line.startswith("MemAvailable:"):
                        info.available_gb = int(line.split()[1]) / (1024 * 1024)

            try:
                r = subprocess.run(
                    ["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
                     "--format=csv,noheader,nounits"],
                    capture_output=True, text=True, timeout=10
                )
                if r.returncode == 0 and r.stdout.strip():
                    # First GPU only; values are in MiB.
                    parts = r.stdout.strip().split("\n")[0].split(", ")
                    if len(parts) >= 3:
                        info.gpu_name = parts[0].strip()
                        info.gpu_memory_gb = float(parts[1]) / 1024
                        info.available_gb = max(info.available_gb, float(parts[2]) / 1024)
                        info.detection_method = "nvidia-smi"
            except (FileNotFoundError, subprocess.TimeoutExpired):
                pass  # no NVIDIA tooling installed — CPU-only numbers stand
        except Exception as e:
            logger.warning("Linux detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"

        return info

    @classmethod
    def _detect_fallback(cls, system: str) -> "SystemInfo":
        """Fallback for unknown systems. Tries psutil, else fixed defaults."""
        try:
            import psutil
            mem = psutil.virtual_memory()
            return cls(
                total_gb=round(mem.total / (1024 ** 3), 1),
                available_gb=round(mem.available / (1024 ** 3), 1),
                detection_method="psutil",
            )
        except ImportError:
            return cls(total_gb=16.0, available_gb=12.0, detection_method="fallback")
@dataclass
class SelectionResult:
    """Result of preset selection."""
    preset: str                      # preset key into PRESETS
    reason: str                      # human-readable selection rationale
    overhead_gb: float               # available memory minus model size
    quality: str
    compression_ratio: float
    vllm_compatible: bool
    kv_type: str
    layer_adaptive: int
    env_vars: Dict[str, str] = field(default_factory=dict)
    server_flags: Dict[str, str] = field(default_factory=dict)
    warnings: list = field(default_factory=list)
    system_info: Optional["SystemInfo"] = None

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; system info added when present."""
        out = {
            "preset": self.preset,
            "reason": self.reason,
            "overhead_gb": round(self.overhead_gb, 2),
            "quality": self.quality,
            "compression_ratio": self.compression_ratio,
            "vllm_compatible": self.vllm_compatible,
            "kv_type": self.kv_type,
            "layer_adaptive": self.layer_adaptive,
            "env_vars": self.env_vars,
            "server_flags": self.server_flags,
            "warnings": self.warnings,
        }
        si = self.system_info
        if si:
            out["system"] = {
                "total_gb": round(si.total_gb, 1),
                "available_gb": round(si.available_gb, 1),
                "gpu_name": si.gpu_name,
                "is_apple_silicon": si.is_apple_silicon,
                "detection_method": si.detection_method,
            }
        return out


# ── Selection Logic ───────────────────────────────────────────────────────────

def select_preset(
    available_gb: float,
    model_size_gb: float,
    require_vllm: bool = False,
) -> SelectionResult:
    """Select the best TurboQuant preset based on memory overhead.

    Args:
        available_gb: Available system memory in GB
        model_size_gb: Model size in GB
        require_vllm: If True, only select vLLM-compatible presets

    Returns:
        SelectionResult with chosen preset and reasoning
    """
    overhead = available_gb - model_size_gb

    # Not even the model fits — fall straight through to the GGUF fallback.
    if overhead < 0:
        logger.warning(
            "Insufficient memory: need %.1f GB, have %.1f GB available",
            model_size_gb, available_gb
        )
        return _make_result("q4_0", overhead,
                            reason=f"Insufficient memory ({overhead:.1f} GB deficit), using GGUF fallback")

    # Walk quality order (best first) and take the first preset that fits.
    for candidate in QUALITY_ORDER:
        spec = PRESETS[candidate]

        if require_vllm and not spec["vllm_compatible"]:
            continue

        if overhead >= spec["min_overhead_gb"]:
            reason = (
                f"Overhead {overhead:.1f} GB >= {spec['min_overhead_gb']} GB "
                f"required for {spec['name']}"
            )
            return _make_result(candidate, overhead, reason=reason)

    # Nothing fits — aggressive fallback
    return _make_result("q4_0", overhead,
                        reason=f"Overhead {overhead:.1f} GB too low for TurboQuant, using GGUF fallback")
"-ctv": p["kv_type"]} + + warnings = [] + if overhead_gb < 2.0 and overhead_gb >= 0: + warnings.append(f"Low headroom ({overhead_gb:.1f} GB). Consider reducing context length.") + if overhead_gb < 0: + warnings.append( + f"OVERCOMMITTED: model needs {abs(overhead_gb):.1f} GB more than available. " + f"Inference may fail or swap heavily." + ) + + return SelectionResult( + preset=preset_name, + reason=reason, + overhead_gb=overhead_gb, + quality=p["quality"], + compression_ratio=p["compression_ratio"], + vllm_compatible=p["vllm_compatible"], + kv_type=p["kv_type"], + layer_adaptive=p["layer_adaptive"], + env_vars=env_vars, + server_flags=server_flags, + warnings=warnings, + ) + + +# ── Auto-Select (with system detection + config) ───────────────────────────── + +def auto_select( + model_size_gb: float, + config_override: Optional[str] = None, + require_vllm: bool = False, + config_path: Optional[str] = None, +) -> SelectionResult: + """Auto-select preset based on system detection. + + Args: + model_size_gb: Model size in GB + config_override: Force a specific preset (skips detection) + require_vllm: Require vLLM compatibility + config_path: Path to config file (default: $HERMES_HOME/turboquant.json) + + Returns: + SelectionResult + """ + # Load config file if exists + config = _load_config(config_path) + + # Config override takes precedence + override = config_override or config.get("preset_override") + if override: + if override in PRESETS: + logger.info("Using config override: %s", override) + p = PRESETS[override] + return SelectionResult( + preset=override, + reason=f"Config override: {p['name']}", + overhead_gb=0, + quality=p["quality"], + compression_ratio=p["compression_ratio"], + vllm_compatible=p["vllm_compatible"], + kv_type=p["kv_type"], + layer_adaptive=p["layer_adaptive"], + env_vars={"TURBO_LAYER_ADAPTIVE": str(p["layer_adaptive"])}, + server_flags={"-ctk": p["kv_type"], "-ctv": p["kv_type"]}, + ) + else: + logger.warning("Unknown preset override '%s', 
falling back to auto-select", override) + + # Detect system + sys_info = SystemInfo.detect() + logger.info( + "System: %.1f GB total, %.1f GB available, model: %.1f GB", + sys_info.total_gb, sys_info.available_gb, model_size_gb + ) + + # Select + result = select_preset( + available_gb=sys_info.available_gb, + model_size_gb=model_size_gb, + require_vllm=require_vllm, + ) + result.system_info = sys_info + + # Apply context length from config + context_length = config.get("context_length") + if context_length: + result.server_flags["-c"] = str(context_length) + + return result + + +def _load_config(config_path: Optional[str] = None) -> dict: + """Load turboquant config from JSON file.""" + if config_path: + p = Path(config_path) + else: + try: + from hermes_constants import get_hermes_home + p = get_hermes_home() / "turboquant.json" + except (ImportError, Exception): + p = Path.home() / ".hermes" / "turboquant.json" + + if p.exists(): + try: + return json.loads(p.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + pass + return {} + + +def save_config(values: dict, config_path: Optional[str] = None) -> None: + """Save turboquant config to JSON file.""" + if config_path: + p = Path(config_path) + else: + try: + from hermes_constants import get_hermes_home + p = get_hermes_home() / "turboquant.json" + except (ImportError, Exception): + p = Path.home() / ".hermes" / "turboquant.json" + + p.parent.mkdir(parents=True, exist_ok=True) + + existing = {} + if p.exists(): + try: + existing = json.loads(p.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + pass + + existing.update(values) + p.write_text(json.dumps(existing, indent=2), encoding="utf-8") + logger.info("Config saved to %s", p) + + +# ── Utility ─────────────────────────────────────────────────────────────────── + +def get_preset_info(preset_name: str) -> Optional[dict]: + """Get information about a preset.""" + return PRESETS.get(preset_name) + + +def list_presets() -> 
def list_presets() -> dict:
    """Return a (shallow) copy of all preset definitions."""
    return PRESETS.copy()


def format_env_commands(result: "SelectionResult") -> str:
    """Format a selection result as shell export commands plus comments.

    Emits the preset summary, the reason, one `export` line per env var,
    the llama-server flags as a comment, and any warnings.
    """
    lines = [f"# TurboQuant preset: {result.preset} ({result.quality}, {result.compression_ratio}x)"]
    lines.append(f"# {result.reason}")
    for k, v in result.env_vars.items():
        lines.append(f"export {k}={v}")
    flags = " ".join(f"{k} {v}" for k, v in result.server_flags.items())
    if flags:
        lines.append(f"# llama-server flags: {flags}")
    for w in result.warnings:
        lines.append(f"# WARNING: {w}")
    return "\n".join(lines)


# ── CLI ───────────────────────────────────────────────────────────────────────

def main():
    """CLI entry point."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Auto-select TurboQuant preset based on available memory",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --model-size 14.0                 # Auto-detect and select
  %(prog)s --model-size 7.0 --json           # JSON output
  %(prog)s --model-size 7.0 --list           # List available presets
  %(prog)s --model-size 7.0 --require-vllm
  %(prog)s --model-size 7.0 --preset turboquant_4bit_nc   # Override
""",
    )
    parser.add_argument("--model-size", type=float, help="Model size in GB")
    parser.add_argument("--preset", help="Force a specific preset (skip detection)")
    parser.add_argument("--require-vllm", action="store_true", help="Require vLLM compatibility")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--list", action="store_true", help="List available presets")
    parser.add_argument("--shell", action="store_true", help="Output as shell export commands")
    parser.add_argument("--detect-only", action="store_true", help="Only detect hardware")
    parser.add_argument("--config", help="Path to config file")

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    if args.list:
        print("Available presets:")
        print(f"  {'Name':22} {'Quality':8} {'Compress':8} {'vLLM':5} {'Overhead':8} Description")
        for name in QUALITY_ORDER:
            p = PRESETS[name]
            vllm = "yes" if p["vllm_compatible"] else "no"
            print(
                f"  {name:22} {p['quality']:8} {p['compression_ratio']:.1f}x     "
                f"{vllm:5} >= {p['min_overhead_gb']:2.0f} GB  {p['description']}"
            )
        return

    if args.detect_only:
        info = SystemInfo.detect()
        if args.json:
            print(json.dumps({
                "total_gb": round(info.total_gb, 1),
                "available_gb": round(info.available_gb, 1),
                "gpu_memory_gb": round(info.gpu_memory_gb, 1) if info.gpu_memory_gb else None,
                "gpu_name": info.gpu_name,
                "is_apple_silicon": info.is_apple_silicon,
                "detection_method": info.detection_method,
            }, indent=2))
        else:
            print(f"Total memory: {info.total_gb:.1f} GB")
            print(f"Available: {info.available_gb:.1f} GB")
            if info.gpu_memory_gb:
                print(f"GPU memory: {info.gpu_memory_gb:.1f} GB")
            if info.gpu_name:
                print(f"GPU: {info.gpu_name}")
            if info.is_apple_silicon:
                print("Chip: Apple Silicon")
            print(f"Detection: {info.detection_method}")
        return

    # FIX: was `if not args.model_size:` which also rejected an explicit
    # `--model-size 0` (0.0 is falsy). Only a truly absent flag should error.
    if args.model_size is None:
        parser.error("--model-size is required (unless using --list or --detect-only)")

    result = auto_select(
        model_size_gb=args.model_size,
        config_override=args.preset,
        require_vllm=args.require_vllm,
        config_path=args.config,
    )

    if args.json:
        print(json.dumps(result.to_dict(), indent=2))
    elif args.shell:
        print(format_env_commands(result))
    else:
        print(f"Selected: {result.preset} ({result.quality}, {result.compression_ratio}x)")
        print(f"  {result.reason}")
        if result.system_info:
            print(f"  System: {result.system_info.total_gb:.0f} GB total, {result.system_info.available_gb:.0f} GB available")
        print(f"  Overhead: {result.overhead_gb:.1f} GB")
        print()
        print("Environment:")
        for k, v in result.env_vars.items():
            print(f"  export {k}={v}")
        print("Server flags:")
        for k, v in result.server_flags.items():
            print(f"  {k} {v}")
        if result.warnings:
            print()
            for w in result.warnings:
                print(f"  WARNING: {w}")


if __name__ == "__main__":
    main()