Compare commits
1 Commits
burn/97-17
...
fix/92-har
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a537511652 |
@@ -1,5 +1,29 @@
|
||||
"""Phase 19: Hardware-Aware Inference Optimization.
|
||||
Part of the TurboQuant suite for local inference excellence.
|
||||
"""Backward-compatible shim for hardware-aware quantization selection.
|
||||
|
||||
The original Phase 19 placeholder `hardware_optimizer.py` never shipped real
|
||||
logic. The canonical implementation now lives in `evolution.quant_selector`.
|
||||
This shim preserves the legacy import path for any downstream callers while
|
||||
making `quant_selector.py` the single source of truth.
|
||||
"""
|
||||
import logging
|
||||
# ... (rest of the code)
|
||||
|
||||
from evolution.quant_selector import ( # noqa: F401
|
||||
HardwareInfo,
|
||||
QuantLevel,
|
||||
QuantSelection,
|
||||
QUANT_LEVELS,
|
||||
detect_hardware,
|
||||
estimate_kv_cache_gb,
|
||||
estimate_model_memory_gb,
|
||||
select_quant_level,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"HardwareInfo",
|
||||
"QuantLevel",
|
||||
"QuantSelection",
|
||||
"QUANT_LEVELS",
|
||||
"detect_hardware",
|
||||
"estimate_kv_cache_gb",
|
||||
"estimate_model_memory_gb",
|
||||
"select_quant_level",
|
||||
]
|
||||
|
||||
@@ -1,311 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for turboquant/auto_select.py — preset selection logic."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
|
||||
from turboquant.auto_select import (
|
||||
select_preset,
|
||||
auto_select,
|
||||
PRESETS,
|
||||
QUALITY_ORDER,
|
||||
SelectionResult,
|
||||
SystemInfo,
|
||||
_load_config,
|
||||
save_config,
|
||||
format_env_commands,
|
||||
)
|
||||
|
||||
|
||||
# ── Preset Selection ──────────────────────────────────────────────────────────
|
||||
|
||||
class TestSelectPreset:
    """Test preset selection logic.

    Overhead is `available_gb - model_size_gb`; thresholds are 8 / 4 / 2 GB
    per issue #97.
    """

    def test_high_overhead_best_quality(self):
        """8+ GB overhead -> turboquant_k8v4 (best)."""
        result = select_preset(available_gb=20, model_size_gb=10)
        assert result.preset == "turboquant_k8v4"
        assert result.quality == "best"
        assert result.compression_ratio == 2.6
        assert result.vllm_compatible is True

    def test_medium_overhead_good_quality(self):
        """4-8 GB overhead -> turboquant_4bit_nc (good)."""
        result = select_preset(available_gb=12, model_size_gb=6)
        assert result.preset == "turboquant_4bit_nc"
        assert result.quality == "good"
        assert result.compression_ratio == 3.8

    def test_low_overhead_usable_quality(self):
        """2-4 GB overhead -> turboquant_3bit_nc (usable)."""
        result = select_preset(available_gb=8, model_size_gb=5)
        assert result.preset == "turboquant_3bit_nc"
        assert result.quality == "usable"
        assert result.compression_ratio == 4.9

    def test_minimal_overhead_fallback(self):
        """<2 GB overhead -> q4_0 (basic fallback)."""
        result = select_preset(available_gb=5, model_size_gb=4)
        assert result.preset == "q4_0"
        assert result.quality == "basic"
        assert result.vllm_compatible is False

    def test_negative_overhead_fallback(self):
        """Negative overhead (not enough memory) -> q4_0 plus a warning."""
        result = select_preset(available_gb=3, model_size_gb=10)
        assert result.preset == "q4_0"
        assert result.overhead_gb < 0
        assert len(result.warnings) > 0
        assert "OVERCOMMITTED" in result.warnings[0]

    def test_exact_threshold(self):
        """Exactly at threshold selects that preset (>= comparison)."""
        result = select_preset(available_gb=12, model_size_gb=4)  # 8 GB overhead
        assert result.preset == "turboquant_k8v4"

    def test_just_below_threshold(self):
        """Just below threshold selects next tier."""
        result = select_preset(available_gb=11.9, model_size_gb=4)  # 7.9 GB overhead
        assert result.preset == "turboquant_4bit_nc"

    def test_zero_model_size(self):
        """Zero model size -> all overhead, best preset."""
        result = select_preset(available_gb=16, model_size_gb=0)
        assert result.preset == "turboquant_k8v4"
|
||||
|
||||
|
||||
# ── vLLM Requirement ─────────────────────────────────────────────────────────
|
||||
|
||||
class TestVLLMRequirement:
    """Test vLLM compatibility filtering (`require_vllm=True`)."""

    def test_require_vllm_skips_fallback(self):
        """require_vllm should skip q4_0 (not vLLM compatible)."""
        # Very low overhead — without vLLM requirement would pick q4_0
        result = select_preset(available_gb=5, model_size_gb=4, require_vllm=True)
        # q4_0 is not vLLM compatible; with 1 GB overhead nothing else fits
        # so it falls through to q4_0 as final fallback
        assert result.preset in QUALITY_ORDER

    def test_require_vllm_high_overhead(self):
        """With high overhead, vLLM-compatible preset selected."""
        result = select_preset(available_gb=20, model_size_gb=10, require_vllm=True)
        assert result.vllm_compatible is True
|
||||
|
||||
|
||||
# ── SelectionResult ───────────────────────────────────────────────────────────
|
||||
|
||||
class TestSelectionResult:
    """Test SelectionResult dataclass serialization and derived fields."""

    def test_to_dict(self):
        # to_dict must expose the preset plus the launch configuration.
        result = select_preset(available_gb=20, model_size_gb=10)
        d = result.to_dict()
        assert d["preset"] == "turboquant_k8v4"
        assert d["quality"] == "best"
        assert "env_vars" in d
        assert "server_flags" in d
        assert "TURBO_LAYER_ADAPTIVE" in d["env_vars"]
        assert "-ctk" in d["server_flags"]

    def test_env_vars_populated(self):
        # k8v4 preset declares layer_adaptive=7.
        result = select_preset(available_gb=20, model_size_gb=10)
        assert "TURBO_LAYER_ADAPTIVE" in result.env_vars
        assert result.env_vars["TURBO_LAYER_ADAPTIVE"] == "7"

    def test_server_flags_populated(self):
        # Both K and V cache type flags must be emitted.
        result = select_preset(available_gb=20, model_size_gb=10)
        assert "-ctk" in result.server_flags
        assert "-ctv" in result.server_flags
|
||||
|
||||
|
||||
# ── Preset Definitions ────────────────────────────────────────────────────────
|
||||
|
||||
class TestPresets:
    """Test preset definitions (schema completeness and ordering invariants)."""

    def test_all_presets_have_required_fields(self):
        # Every preset must carry the full schema used by _make_result/CLI.
        for name, preset in PRESETS.items():
            assert "name" in preset, f"{name} missing 'name'"
            assert "description" in preset, f"{name} missing 'description'"
            assert "min_overhead_gb" in preset, f"{name} missing 'min_overhead_gb'"
            assert "compression_ratio" in preset, f"{name} missing 'compression_ratio'"
            assert "quality" in preset, f"{name} missing 'quality'"
            assert "vllm_compatible" in preset, f"{name} missing 'vllm_compatible'"
            assert "kv_type" in preset, f"{name} missing 'kv_type'"
            assert "layer_adaptive" in preset, f"{name} missing 'layer_adaptive'"

    def test_quality_order_matches_presets(self):
        for name in QUALITY_ORDER:
            assert name in PRESETS, f"{name} in QUALITY_ORDER but not in PRESETS"

    def test_all_presets_in_quality_order(self):
        for name in PRESETS:
            assert name in QUALITY_ORDER, f"{name} in PRESETS but not in QUALITY_ORDER"

    def test_compression_increases_with_aggressiveness(self):
        """More aggressive presets should have higher compression."""
        for i in range(len(QUALITY_ORDER) - 1):
            current = PRESETS[QUALITY_ORDER[i]]
            next_p = PRESETS[QUALITY_ORDER[i + 1]]
            # TurboQuant presets should increase in compression
            # q4_0 is a fallback and may break the pattern
            if QUALITY_ORDER[i + 1] != "q4_0":
                assert current["compression_ratio"] <= next_p["compression_ratio"], (
                    f"{QUALITY_ORDER[i]} ({current['compression_ratio']}x) should have "
                    f"<= compression than {QUALITY_ORDER[i+1]} ({next_p['compression_ratio']}x)"
                )

    def test_overhead_decreases_with_aggressiveness(self):
        """More aggressive presets should have lower overhead requirements."""
        for i in range(len(QUALITY_ORDER) - 1):
            current = PRESETS[QUALITY_ORDER[i]]
            next_p = PRESETS[QUALITY_ORDER[i + 1]]
            assert current["min_overhead_gb"] >= next_p["min_overhead_gb"], (
                f"{QUALITY_ORDER[i]} (overhead={current['min_overhead_gb']}GB) should have "
                f">= overhead than {QUALITY_ORDER[i+1]} (overhead={next_p['min_overhead_gb']}GB)"
            )
|
||||
|
||||
|
||||
# ── SystemInfo ────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestSystemInfo:
    """Test system detection.

    NOTE(review): these run real detection (sysctl / /proc / psutil) on the
    host — results vary by machine, so only sanity invariants are asserted.
    """

    def test_detect_returns_info(self):
        info = SystemInfo.detect()
        assert info.total_gb > 0
        assert info.available_gb > 0
        assert info.detection_method

    def test_available_less_than_total(self):
        info = SystemInfo.detect()
        assert info.available_gb <= info.total_gb
|
||||
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestConfig:
    """Test config loading and saving (round-trips through tmp_path)."""

    def test_load_config_missing_file(self, tmp_path):
        # Missing file is not an error: returns an empty config.
        config = _load_config(str(tmp_path / "nonexistent.json"))
        assert config == {}

    def test_load_config_valid_file(self, tmp_path):
        config_file = tmp_path / "turboquant.json"
        config_file.write_text(json.dumps({"preset_override": "turboquant_4bit_nc"}))
        config = _load_config(str(config_file))
        assert config["preset_override"] == "turboquant_4bit_nc"

    def test_save_and_load_config(self, tmp_path):
        config_file = tmp_path / "turboquant.json"
        save_config({"preset_override": "turboquant_k8v4", "context_length": 32768},
                    str(config_file))
        config = _load_config(str(config_file))
        assert config["preset_override"] == "turboquant_k8v4"
        assert config["context_length"] == 32768

    def test_save_config_merges(self, tmp_path):
        # save_config must merge into an existing file, not overwrite it.
        config_file = tmp_path / "turboquant.json"
        config_file.write_text(json.dumps({"existing_key": "value"}))
        save_config({"new_key": "new_value"}, str(config_file))
        config = _load_config(str(config_file))
        assert config["existing_key"] == "value"
        assert config["new_key"] == "new_value"
|
||||
|
||||
|
||||
# ── Auto-Select ───────────────────────────────────────────────────────────────
|
||||
|
||||
class TestAutoSelect:
    """Test auto_select with mocked system detection.

    Every call passes a config_path inside tmp_path so results cannot be
    influenced by a real ~/.hermes/turboquant.json on the developer's
    machine (the previous version of some tests read the user's config).
    """

    def test_auto_select_with_override(self, tmp_path):
        """An explicit config_override wins and skips detection."""
        result = auto_select(
            model_size_gb=14.0,
            config_override="turboquant_4bit_nc",
            config_path=str(tmp_path / "turboquant.json"),
        )
        assert result.preset == "turboquant_4bit_nc"
        assert "Config override" in result.reason

    def test_auto_select_unknown_override_ignored(self, tmp_path):
        """An unknown override name falls back to normal detection."""
        result = auto_select(
            model_size_gb=14.0,
            config_override="nonexistent_preset",
            config_path=str(tmp_path / "turboquant.json"),
        )
        # Should fall back to normal detection
        assert result.preset in QUALITY_ORDER

    def test_auto_select_config_override(self, tmp_path):
        """Config file preset_override is respected."""
        config_file = tmp_path / "turboquant.json"
        config_file.write_text(json.dumps({"preset_override": "turboquant_3bit_nc"}))
        result = auto_select(model_size_gb=14.0, config_path=str(config_file))
        assert result.preset == "turboquant_3bit_nc"

    def test_auto_select_detects_system(self, tmp_path):
        """Mocked detection flows into SelectionResult.system_info."""
        with patch("turboquant.auto_select.SystemInfo.detect") as mock_detect:
            mock_detect.return_value = SystemInfo(
                total_gb=32, available_gb=24, is_apple_silicon=True,
                detection_method="mock"
            )
            result = auto_select(
                model_size_gb=14.0,
                config_path=str(tmp_path / "turboquant.json"),
            )
            assert result.system_info is not None
            assert result.system_info.total_gb == 32
            assert result.preset == "turboquant_k8v4"  # 24 - 14 = 10 GB overhead

    def test_auto_select_tight_memory(self, tmp_path):
        """Tight headroom drops to the 3-bit preset."""
        with patch("turboquant.auto_select.SystemInfo.detect") as mock_detect:
            mock_detect.return_value = SystemInfo(
                total_gb=16, available_gb=12, detection_method="mock"
            )
            result = auto_select(
                model_size_gb=10.0,
                config_path=str(tmp_path / "turboquant.json"),
            )
            # 12 - 10 = 2 GB overhead -> turboquant_3bit_nc (min 2 GB)
            assert result.preset == "turboquant_3bit_nc"
|
||||
|
||||
|
||||
# ── Format ────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestFormat:
    """Test formatting utilities (shell-export rendering)."""

    def test_format_env_commands(self):
        result = select_preset(available_gb=20, model_size_gb=10)
        output = format_env_commands(result)
        assert "export TURBO_LAYER_ADAPTIVE=7" in output
        assert "turboquant_k8v4" in output

    def test_format_env_commands_with_warnings(self):
        # 1 GB overhead triggers the low-headroom warning path.
        result = select_preset(available_gb=5, model_size_gb=4)
        output = format_env_commands(result)
        assert "WARNING" in output
|
||||
|
||||
|
||||
# ── Integration with issue spec ───────────────────────────────────────────────
|
||||
|
||||
class TestIssueSpec:
    """Verify implementation matches the exact logic from issue #97."""

    def test_overhead_8gb_plus(self):
        """overhead >= 8 -> turboquant_k8v4"""
        result = select_preset(available_gb=15, model_size_gb=7)  # 8 GB overhead
        assert result.preset == "turboquant_k8v4"

    def test_overhead_4_to_8(self):
        """4 <= overhead < 8 -> turboquant_4bit_nc"""
        result = select_preset(available_gb=11, model_size_gb=7)  # 4 GB overhead
        assert result.preset == "turboquant_4bit_nc"

    def test_overhead_2_to_4(self):
        """2 <= overhead < 4 -> turboquant_3bit_nc"""
        result = select_preset(available_gb=9, model_size_gb=7)  # 2 GB overhead
        assert result.preset == "turboquant_3bit_nc"

    def test_overhead_under_2(self):
        """overhead < 2 -> q4_0"""
        result = select_preset(available_gb=8, model_size_gb=7)  # 1 GB overhead
        assert result.preset == "q4_0"
|
||||
21
tests/test_hardware_optimizer.py
Normal file
21
tests/test_hardware_optimizer.py
Normal file
@@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for hardware_optimizer compatibility shim."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||
|
||||
from evolution import hardware_optimizer, quant_selector
|
||||
|
||||
|
||||
def test_hardware_optimizer_reexports_quant_selector_api():
    """The shim must re-export the same objects (identity, not copies)."""
    assert hardware_optimizer.select_quant_level is quant_selector.select_quant_level
    assert hardware_optimizer.detect_hardware is quant_selector.detect_hardware
    assert hardware_optimizer.HardwareInfo is quant_selector.HardwareInfo
    assert hardware_optimizer.QuantSelection is quant_selector.QuantSelection
|
||||
|
||||
|
||||
def test_hardware_optimizer_exports_quant_level_definitions():
    """Quant-level table and enum are shared objects, not duplicated."""
    assert hardware_optimizer.QUANT_LEVELS is quant_selector.QUANT_LEVELS
    assert hardware_optimizer.QuantLevel is quant_selector.QuantLevel
|
||||
@@ -1,23 +0,0 @@
|
||||
"""TurboQuant — Auto-configuration and preset selection for KV cache compression."""
|
||||
|
||||
from turboquant.auto_select import (
|
||||
auto_select,
|
||||
select_preset,
|
||||
list_presets,
|
||||
get_preset_info,
|
||||
SystemInfo,
|
||||
SelectionResult,
|
||||
PRESETS,
|
||||
QUALITY_ORDER,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"auto_select",
|
||||
"select_preset",
|
||||
"list_presets",
|
||||
"get_preset_info",
|
||||
"SystemInfo",
|
||||
"SelectionResult",
|
||||
"PRESETS",
|
||||
"QUALITY_ORDER",
|
||||
]
|
||||
@@ -1,557 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
TurboQuant Auto-Select — Choose optimal KV cache preset based on available memory.
|
||||
|
||||
Implements the config helper from issue #97: detects available memory and
|
||||
selects the best TurboQuant preset for the deployment environment.
|
||||
|
||||
Presets map to deployment configurations (not raw quant levels):
|
||||
turboquant_k8v4 — 8+ GB overhead, best quality (k/v asymmetric: 8-bit K, 4-bit V)
|
||||
turboquant_4bit_nc — 4-8 GB overhead, good quality (4-bit, no calibration)
|
||||
turboquant_3bit_nc — 2-4 GB overhead, usable quality (3-bit, no calibration)
|
||||
q4_0 — <2 GB overhead, GGUF fallback (no vLLM)
|
||||
|
||||
Usage:
|
||||
from turboquant.auto_select import auto_select
|
||||
|
||||
result = auto_select(model_size_gb=7.0)
|
||||
print(f"Selected: {result.preset}")
|
||||
|
||||
CLI:
|
||||
python3 turboquant/auto_select.py --model-size 7.0
|
||||
python3 turboquant/auto_select.py --model-size 7.0 --json
|
||||
python3 turboquant/auto_select.py --list
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── Preset Definitions ────────────────────────────────────────────────────────
|
||||
|
||||
# Deployment presets keyed by name. Fields:
#   min_overhead_gb   — memory headroom (available - model) required to pick it
#   compression_ratio — advertised KV-cache compression factor
#   kv_type           — llama-server -ctk/-ctv cache type value
#   layer_adaptive    — value exported as TURBO_LAYER_ADAPTIVE
PRESETS: Dict[str, Dict[str, Any]] = {
    "turboquant_k8v4": {
        "name": "TurboQuant K8V4",
        "description": "Best quality — asymmetric 8-bit K, 4-bit V",
        "min_overhead_gb": 8,
        "compression_ratio": 2.6,
        "quality": "best",
        "vllm_compatible": True,
        "kv_type": "turbo4",
        "layer_adaptive": 7,
    },
    "turboquant_4bit_nc": {
        "name": "TurboQuant 4-bit NC",
        "description": "Good quality — 4-bit, no calibration",
        "min_overhead_gb": 4,
        "compression_ratio": 3.8,
        "quality": "good",
        "vllm_compatible": True,
        "kv_type": "turbo4",
        "layer_adaptive": 5,
    },
    "turboquant_3bit_nc": {
        "name": "TurboQuant 3-bit NC",
        "description": "Usable quality — 3-bit, no calibration",
        "min_overhead_gb": 2,
        "compression_ratio": 4.9,
        "quality": "usable",
        "vllm_compatible": True,
        "kv_type": "turbo3",
        "layer_adaptive": 3,
    },
    # Final fallback: min_overhead_gb=0 means the selection loop always
    # terminates here when nothing better fits (unless require_vllm skips it).
    "q4_0": {
        "name": "Q4_0 GGUF",
        "description": "GGUF fallback — no TurboQuant, no vLLM",
        "min_overhead_gb": 0,
        "compression_ratio": 4.0,
        "quality": "basic",
        "vllm_compatible": False,
        "kv_type": "q4_0",
        "layer_adaptive": 0,
    },
}

# Ordered from best quality to most aggressive; select_preset scans this
# front-to-back and picks the first preset whose overhead floor is met.
QUALITY_ORDER = ["turboquant_k8v4", "turboquant_4bit_nc", "turboquant_3bit_nc", "q4_0"]
|
||||
|
||||
|
||||
# ── Data Classes ──────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class SystemInfo:
    """Detected system resources (memory figures in GiB)."""

    # Total physical (or unified) memory.
    total_gb: float
    # Memory currently usable for model weights + KV cache.
    available_gb: float
    # Dedicated GPU memory; equals total_gb on unified-memory Apple Silicon.
    gpu_memory_gb: Optional[float] = None
    gpu_name: Optional[str] = None
    is_apple_silicon: bool = False
    # How the numbers were obtained: "sysctl", "proc", "nvidia-smi",
    # "psutil", or "fallback" (hard-coded 16/12 GB guess).
    detection_method: str = ""

    @classmethod
    def detect(cls) -> "SystemInfo":
        """Detect available memory. Uses platform-appropriate methods."""
        system = platform.system()

        if system == "Darwin":
            return cls._detect_macos()
        elif system == "Linux":
            return cls._detect_linux()
        else:
            return cls._detect_fallback(system)

    @staticmethod
    def _parse_vm_stat(text: str) -> tuple:
        """Parse `vm_stat` output into (page_size_bytes, free_pages).

        FIX: the previous implementation hard-coded page_size = 4096, but the
        vm_stat header advertises the real page size ("... (page size of
        16384 bytes)") and Apple Silicon uses 16 KiB pages — so free memory
        was undercounted by 4x. We now read the header and fall back to 4096
        only when it is missing or malformed.
        """
        page_size = 4096  # fallback for unexpected output
        free_pages = 0
        header = text.split("\n", 1)[0]
        if "page size of" in header:
            try:
                page_size = int(header.split("page size of", 1)[1].split()[0])
            except (ValueError, IndexError):
                pass
        for line in text.split("\n"):
            if "Pages free:" in line:
                try:
                    # Values are formatted like "Pages free:  123456."
                    free_pages = int(line.split(":")[-1].strip().rstrip("."))
                except ValueError:
                    pass
        return page_size, free_pages

    @classmethod
    def _detect_macos(cls) -> "SystemInfo":
        """Detect Apple Silicon unified memory via sysctl / vm_stat."""
        info = cls(total_gb=0, available_gb=0, is_apple_silicon=True, detection_method="sysctl")
        try:
            r = subprocess.run(["sysctl", "-n", "hw.memsize"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                info.total_gb = int(r.stdout.strip()) / (1024 ** 3)

            r = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                page_size, free_pages = cls._parse_vm_stat(r.stdout)
                info.available_gb = (free_pages * page_size) / (1024 ** 3)

            # "Pages free" alone badly underestimates usable memory on macOS
            # (file cache / purgeable pages aren't counted), so treat < 1 GB
            # as implausible and assume ~70% of total instead.
            if info.available_gb < 1:
                info.available_gb = info.total_gb * 0.70

            info.gpu_memory_gb = info.total_gb  # Unified memory

            r = subprocess.run(
                ["system_profiler", "SPDisplaysDataType"],
                capture_output=True, text=True, timeout=10
            )
            if r.returncode == 0:
                for line in r.stdout.split("\n"):
                    if "Chipset" in line or "GPU" in line:
                        info.gpu_name = line.split(":")[-1].strip()
                        break
        except Exception as e:
            logger.warning("macOS detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"

        return info

    @classmethod
    def _detect_linux(cls) -> "SystemInfo":
        """Detect Linux memory via /proc/meminfo, with optional NVIDIA GPU."""
        info = cls(total_gb=0, available_gb=0, detection_method="proc")
        try:
            with open("/proc/meminfo") as f:
                for line in f:
                    # meminfo values are in KiB.
                    if line.startswith("MemTotal:"):
                        info.total_gb = int(line.split()[1]) / (1024 * 1024)
                    elif line.startswith("MemAvailable:"):
                        info.available_gb = int(line.split()[1]) / (1024 * 1024)

            # nvidia-smi is best-effort: absence or timeout is not an error.
            try:
                r = subprocess.run(
                    ["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
                     "--format=csv,noheader,nounits"],
                    capture_output=True, text=True, timeout=10
                )
                if r.returncode == 0 and r.stdout.strip():
                    parts = r.stdout.strip().split("\n")[0].split(", ")
                    if len(parts) >= 3:
                        info.gpu_name = parts[0].strip()
                        info.gpu_memory_gb = float(parts[1]) / 1024  # MiB -> GiB
                        # Prefer GPU free memory if larger than host available.
                        info.available_gb = max(info.available_gb, float(parts[2]) / 1024)
                        info.detection_method = "nvidia-smi"
            except (FileNotFoundError, subprocess.TimeoutExpired):
                pass
        except Exception as e:
            logger.warning("Linux detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"

        return info

    @classmethod
    def _detect_fallback(cls, system: str) -> "SystemInfo":
        """Fallback for unknown systems. Tries psutil, else a 16/12 GB guess."""
        try:
            import psutil
            mem = psutil.virtual_memory()
            return cls(
                total_gb=round(mem.total / (1024 ** 3), 1),
                available_gb=round(mem.available / (1024 ** 3), 1),
                detection_method="psutil",
            )
        except ImportError:
            return cls(total_gb=16.0, available_gb=12.0, detection_method="fallback")
|
||||
|
||||
|
||||
@dataclass
class SelectionResult:
    """Result of preset selection: chosen preset plus launch configuration."""
    # Preset key into PRESETS (e.g. "turboquant_k8v4").
    preset: str
    # Human-readable explanation of why this preset was chosen.
    reason: str
    # available_gb - model_size_gb at selection time (0 for config overrides).
    overhead_gb: float
    quality: str
    compression_ratio: float
    vllm_compatible: bool
    # llama-server cache type (-ctk/-ctv value).
    kv_type: str
    layer_adaptive: int
    # Environment variables to export (e.g. TURBO_LAYER_ADAPTIVE).
    env_vars: Dict[str, str] = field(default_factory=dict)
    # llama-server CLI flags keyed by flag name.
    server_flags: Dict[str, str] = field(default_factory=dict)
    warnings: list = field(default_factory=list)
    # Populated by auto_select(); None when select_preset() is called directly.
    system_info: Optional[SystemInfo] = None

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; includes "system" only when
        system_info was attached."""
        d = {
            "preset": self.preset,
            "reason": self.reason,
            "overhead_gb": round(self.overhead_gb, 2),
            "quality": self.quality,
            "compression_ratio": self.compression_ratio,
            "vllm_compatible": self.vllm_compatible,
            "kv_type": self.kv_type,
            "layer_adaptive": self.layer_adaptive,
            "env_vars": self.env_vars,
            "server_flags": self.server_flags,
            "warnings": self.warnings,
        }
        if self.system_info:
            d["system"] = {
                "total_gb": round(self.system_info.total_gb, 1),
                "available_gb": round(self.system_info.available_gb, 1),
                "gpu_name": self.system_info.gpu_name,
                "is_apple_silicon": self.system_info.is_apple_silicon,
                "detection_method": self.system_info.detection_method,
            }
        return d
|
||||
|
||||
|
||||
# ── Selection Logic ───────────────────────────────────────────────────────────
|
||||
|
||||
def select_preset(
    available_gb: float,
    model_size_gb: float,
    require_vllm: bool = False,
) -> SelectionResult:
    """Pick the best TurboQuant preset for the given memory budget.

    Args:
        available_gb: Available system memory in GB
        model_size_gb: Model size in GB
        require_vllm: If True, only select vLLM-compatible presets

    Returns:
        SelectionResult with chosen preset and reasoning
    """
    headroom = available_gb - model_size_gb

    # Model doesn't even fit: warn and bail straight to the GGUF fallback.
    if headroom < 0:
        logger.warning(
            "Insufficient memory: need %.1f GB, have %.1f GB available",
            model_size_gb, available_gb
        )
        return _make_result(
            "q4_0", headroom,
            reason=f"Insufficient memory ({headroom:.1f} GB deficit), using GGUF fallback",
        )

    # Scan best-quality-first; the first preset whose overhead floor we meet wins.
    candidates = (
        (name, PRESETS[name])
        for name in QUALITY_ORDER
        if not require_vllm or PRESETS[name]["vllm_compatible"]
    )
    for name, spec in candidates:
        floor = spec["min_overhead_gb"]
        if headroom >= floor:
            why = (
                f"Overhead {headroom:.1f} GB >= {floor} GB "
                f"required for {spec['name']}"
            )
            return _make_result(name, headroom, reason=why)

    # Only reachable when require_vllm filtered out the q4_0 floor (0 GB).
    return _make_result(
        "q4_0", headroom,
        reason=f"Overhead {headroom:.1f} GB too low for TurboQuant, using GGUF fallback",
    )
|
||||
|
||||
|
||||
def _make_result(preset_name: str, overhead_gb: float, reason: str) -> SelectionResult:
    """Assemble a SelectionResult (env vars, server flags, warnings) for a preset."""
    spec = PRESETS[preset_name]
    kv = spec["kv_type"]
    adaptive = spec["layer_adaptive"]

    # Low-but-nonnegative headroom and overcommit are mutually exclusive,
    # so an if/elif reproduces the original pair of checks exactly.
    notes = []
    if 0 <= overhead_gb < 2.0:
        notes.append(f"Low headroom ({overhead_gb:.1f} GB). Consider reducing context length.")
    elif overhead_gb < 0:
        notes.append(
            f"OVERCOMMITTED: model needs {abs(overhead_gb):.1f} GB more than available. "
            f"Inference may fail or swap heavily."
        )

    return SelectionResult(
        preset=preset_name,
        reason=reason,
        overhead_gb=overhead_gb,
        quality=spec["quality"],
        compression_ratio=spec["compression_ratio"],
        vllm_compatible=spec["vllm_compatible"],
        kv_type=kv,
        layer_adaptive=adaptive,
        env_vars={"TURBO_LAYER_ADAPTIVE": str(adaptive)},
        server_flags={"-ctk": kv, "-ctv": kv},
        warnings=notes,
    )
|
||||
|
||||
|
||||
# ── Auto-Select (with system detection + config) ─────────────────────────────
|
||||
|
||||
def auto_select(
    model_size_gb: float,
    config_override: Optional[str] = None,
    require_vllm: bool = False,
    config_path: Optional[str] = None,
) -> SelectionResult:
    """Auto-select preset based on system detection.

    Args:
        model_size_gb: Model size in GB
        config_override: Force a specific preset (skips detection)
        require_vllm: Require vLLM compatibility
        config_path: Path to config file (default: $HERMES_HOME/turboquant.json)

    Returns:
        SelectionResult
    """
    cfg = _load_config(config_path)

    # Explicit argument beats the config file's preset_override.
    forced = config_override or cfg.get("preset_override")
    if forced and forced in PRESETS:
        logger.info("Using config override: %s", forced)
        spec = PRESETS[forced]
        kv = spec["kv_type"]
        adaptive = spec["layer_adaptive"]
        return SelectionResult(
            preset=forced,
            reason=f"Config override: {spec['name']}",
            overhead_gb=0,
            quality=spec["quality"],
            compression_ratio=spec["compression_ratio"],
            vllm_compatible=spec["vllm_compatible"],
            kv_type=kv,
            layer_adaptive=adaptive,
            env_vars={"TURBO_LAYER_ADAPTIVE": str(adaptive)},
            server_flags={"-ctk": kv, "-ctv": kv},
        )
    if forced:
        # Unknown override names are ignored, not fatal.
        logger.warning("Unknown preset override '%s', falling back to auto-select", forced)

    sys_info = SystemInfo.detect()
    logger.info(
        "System: %.1f GB total, %.1f GB available, model: %.1f GB",
        sys_info.total_gb, sys_info.available_gb, model_size_gb
    )

    result = select_preset(
        available_gb=sys_info.available_gb,
        model_size_gb=model_size_gb,
        require_vllm=require_vllm,
    )
    result.system_info = sys_info

    # Honor a configured context length via the llama-server -c flag.
    ctx = cfg.get("context_length")
    if ctx:
        result.server_flags["-c"] = str(ctx)

    return result
|
||||
|
||||
|
||||
def _load_config(config_path: Optional[str] = None) -> dict:
|
||||
"""Load turboquant config from JSON file."""
|
||||
if config_path:
|
||||
p = Path(config_path)
|
||||
else:
|
||||
try:
|
||||
from hermes_constants import get_hermes_home
|
||||
p = get_hermes_home() / "turboquant.json"
|
||||
except (ImportError, Exception):
|
||||
p = Path.home() / ".hermes" / "turboquant.json"
|
||||
|
||||
if p.exists():
|
||||
try:
|
||||
return json.loads(p.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def save_config(values: dict, config_path: Optional[str] = None) -> None:
    """Save turboquant config to a JSON file, merging with existing content.

    Existing keys not present in `values` are preserved; keys in `values`
    overwrite existing ones. Parent directories are created as needed.

    Args:
        values: Key/value pairs to merge into the config file.
        config_path: Explicit config file path; defaults to the same
            resolution as _load_config.
    """
    if config_path:
        p = Path(config_path)
    else:
        # Same optional-dependency fallback as _load_config. (Fixed: was
        # `except (ImportError, Exception)` — ImportError is redundant
        # because Exception already covers it.)
        try:
            from hermes_constants import get_hermes_home
            p = get_hermes_home() / "turboquant.json"
        except Exception:
            p = Path.home() / ".hermes" / "turboquant.json"

    p.parent.mkdir(parents=True, exist_ok=True)

    # Merge rather than overwrite so unrelated settings survive a save.
    existing = {}
    if p.exists():
        try:
            existing = json.loads(p.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            # Corrupt file: start fresh rather than fail the save.
            pass

    existing.update(values)
    p.write_text(json.dumps(existing, indent=2), encoding="utf-8")
    logger.info("Config saved to %s", p)
|
||||
|
||||
|
||||
# ── Utility ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_preset_info(preset_name: str) -> Optional[dict]:
    """Return the preset definition for `preset_name`, or None if unknown."""
    try:
        return PRESETS[preset_name]
    except KeyError:
        return None
|
||||
|
||||
|
||||
def list_presets() -> dict:
    """Return a shallow copy of all preset definitions (safe to mutate)."""
    return dict(PRESETS)
|
||||
|
||||
|
||||
def format_env_commands(result: SelectionResult) -> str:
    """Render a SelectionResult as shell `export` commands with comments."""
    # Header: preset summary and the selection reasoning, as comments.
    out = [
        f"# TurboQuant preset: {result.preset} ({result.quality}, {result.compression_ratio}x)",
        f"# {result.reason}",
    ]
    # One export per environment variable.
    out.extend(f"export {name}={value}" for name, value in result.env_vars.items())
    # Server flags can't be exported — surface them as a comment instead.
    flag_text = " ".join(f"{name} {value}" for name, value in result.server_flags.items())
    if flag_text:
        out.append(f"# llama-server flags: {flag_text}")
    out.extend(f"# WARNING: {note}" for note in result.warnings)
    return "\n".join(out)
|
||||
|
||||
|
||||
# ── CLI ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: parse args, then dispatch to list / detect / select."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Auto-select TurboQuant preset based on available memory",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --model-size 14.0              # Auto-detect and select
  %(prog)s --model-size 7.0 --json        # JSON output
  %(prog)s --model-size 7.0 --list        # List available presets
  %(prog)s --model-size 7.0 --require-vllm
  %(prog)s --model-size 7.0 --preset turboquant_4bit_nc  # Override
""",
    )
    parser.add_argument("--model-size", type=float, help="Model size in GB")
    parser.add_argument("--preset", help="Force a specific preset (skip detection)")
    parser.add_argument("--require-vllm", action="store_true", help="Require vLLM compatibility")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--list", action="store_true", help="List available presets")
    parser.add_argument("--shell", action="store_true", help="Output as shell export commands")
    parser.add_argument("--detect-only", action="store_true", help="Only detect hardware")
    parser.add_argument("--config", help="Path to config file")

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # --list: print the preset table and exit (no model size needed).
    if args.list:
        print("Available presets:")
        print(f"  {'Name':22} {'Quality':8} {'Compress':8} {'vLLM':5} {'Overhead':8} Description")
        for name in QUALITY_ORDER:
            p = PRESETS[name]
            vllm = "yes" if p["vllm_compatible"] else "no"
            print(
                f"  {name:22} {p['quality']:8} {p['compression_ratio']:.1f}x     "
                f"{vllm:5} >= {p['min_overhead_gb']:2.0f} GB  {p['description']}"
            )
        return

    # --detect-only: report hardware detection results and exit.
    if args.detect_only:
        info = SystemInfo.detect()
        if args.json:
            print(json.dumps({
                "total_gb": round(info.total_gb, 1),
                "available_gb": round(info.available_gb, 1),
                "gpu_memory_gb": round(info.gpu_memory_gb, 1) if info.gpu_memory_gb else None,
                "gpu_name": info.gpu_name,
                "is_apple_silicon": info.is_apple_silicon,
                "detection_method": info.detection_method,
            }, indent=2))
        else:
            print(f"Total memory: {info.total_gb:.1f} GB")
            print(f"Available: {info.available_gb:.1f} GB")
            if info.gpu_memory_gb:
                print(f"GPU memory: {info.gpu_memory_gb:.1f} GB")
            if info.gpu_name:
                print(f"GPU: {info.gpu_name}")
            if info.is_apple_silicon:
                print(f"Chip: Apple Silicon")
            print(f"Detection: {info.detection_method}")
        return

    # NOTE(review): truthiness check means `--model-size 0` is rejected as
    # missing; `args.model_size is None` would distinguish 0 from absent —
    # confirm whether a zero model size should be accepted here.
    if not args.model_size:
        parser.error("--model-size is required (unless using --list or --detect-only)")

    result = auto_select(
        model_size_gb=args.model_size,
        config_override=args.preset,
        require_vllm=args.require_vllm,
        config_path=args.config,
    )

    # Three output modes: JSON, shell exports, or human-readable summary.
    if args.json:
        print(json.dumps(result.to_dict(), indent=2))
    elif args.shell:
        print(format_env_commands(result))
    else:
        print(f"Selected: {result.preset} ({result.quality}, {result.compression_ratio}x)")
        print(f"  {result.reason}")
        if result.system_info:
            print(f"  System: {result.system_info.total_gb:.0f} GB total, {result.system_info.available_gb:.0f} GB available")
        print(f"  Overhead: {result.overhead_gb:.1f} GB")
        print()
        print(f"Environment:")
        for k, v in result.env_vars.items():
            print(f"  export {k}={v}")
        print(f"Server flags:")
        for k, v in result.server_flags.items():
            print(f"  {k} {v}")
        if result.warnings:
            print()
            for w in result.warnings:
                print(f"  WARNING: {w}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user