#!/usr/bin/env python3
"""Tests for turboquant/auto_select.py — preset selection logic."""

import json
import os
import sys
import tempfile
from pathlib import Path
from unittest.mock import patch, MagicMock

import pytest

sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from turboquant.auto_select import (
    select_preset,
    auto_select,
    PRESETS,
    QUALITY_ORDER,
    SelectionResult,
    SystemInfo,
    _load_config,
    save_config,
    format_env_commands,
)


# ── Preset Selection ──────────────────────────────────────────────────────────

class TestSelectPreset:
    """Selection tiers: overhead (available minus model size) picks the preset."""

    def test_high_overhead_best_quality(self):
        """8+ GB overhead -> turboquant_k8v4 (best)."""
        chosen = select_preset(available_gb=20, model_size_gb=10)
        assert chosen.preset == "turboquant_k8v4"
        assert chosen.quality == "best"
        assert chosen.compression_ratio == 2.6
        assert chosen.vllm_compatible is True

    def test_medium_overhead_good_quality(self):
        """4-8 GB overhead -> turboquant_4bit_nc (good)."""
        chosen = select_preset(available_gb=12, model_size_gb=6)
        assert chosen.preset == "turboquant_4bit_nc"
        assert chosen.quality == "good"
        assert chosen.compression_ratio == 3.8

    def test_low_overhead_usable_quality(self):
        """2-4 GB overhead -> turboquant_3bit_nc (usable)."""
        chosen = select_preset(available_gb=8, model_size_gb=5)
        assert chosen.preset == "turboquant_3bit_nc"
        assert chosen.quality == "usable"
        assert chosen.compression_ratio == 4.9

    def test_minimal_overhead_fallback(self):
        """<2 GB overhead -> q4_0 (basic fallback)."""
        chosen = select_preset(available_gb=5, model_size_gb=4)
        assert chosen.preset == "q4_0"
        assert chosen.quality == "basic"
        assert chosen.vllm_compatible is False

    def test_negative_overhead_fallback(self):
        """Negative overhead (not enough memory) -> q4_0 plus a warning."""
        chosen = select_preset(available_gb=3, model_size_gb=10)
        assert chosen.preset == "q4_0"
        assert chosen.overhead_gb < 0
        assert len(chosen.warnings) > 0
        assert "OVERCOMMITTED" in chosen.warnings[0]

    def test_exact_threshold(self):
        """Exactly at threshold selects that preset."""
        chosen = select_preset(available_gb=12, model_size_gb=4)  # 8 GB overhead
        assert chosen.preset == "turboquant_k8v4"

    def test_just_below_threshold(self):
        """Just below threshold selects next tier."""
        chosen = select_preset(available_gb=11.9, model_size_gb=4)  # 7.9 GB overhead
        assert chosen.preset == "turboquant_4bit_nc"

    def test_zero_model_size(self):
        """Zero model size -> all overhead, best preset."""
        chosen = select_preset(available_gb=16, model_size_gb=0)
        assert chosen.preset == "turboquant_k8v4"


# ── vLLM Requirement ─────────────────────────────────────────────────────────

class TestVLLMRequirement:
    """Filtering on vLLM compatibility."""

    def test_require_vllm_skips_fallback(self):
        """require_vllm should skip q4_0 (not vLLM compatible)."""
        # Very low overhead — without the vLLM requirement this would pick q4_0.
        # q4_0 is not vLLM compatible; with 1 GB overhead nothing else fits,
        # so selection falls through to q4_0 as the final fallback.
        chosen = select_preset(available_gb=5, model_size_gb=4, require_vllm=True)
        assert chosen.preset in QUALITY_ORDER

    def test_require_vllm_high_overhead(self):
        """With high overhead, a vLLM-compatible preset is selected."""
        chosen = select_preset(available_gb=20, model_size_gb=10, require_vllm=True)
        assert chosen.vllm_compatible is True
"turboquant_k8v4" + assert d["quality"] == "best" + assert "env_vars" in d + assert "server_flags" in d + assert "TURBO_LAYER_ADAPTIVE" in d["env_vars"] + assert "-ctk" in d["server_flags"] + + def test_env_vars_populated(self): + result = select_preset(available_gb=20, model_size_gb=10) + assert "TURBO_LAYER_ADAPTIVE" in result.env_vars + assert result.env_vars["TURBO_LAYER_ADAPTIVE"] == "7" + + def test_server_flags_populated(self): + result = select_preset(available_gb=20, model_size_gb=10) + assert "-ctk" in result.server_flags + assert "-ctv" in result.server_flags + + +# ── Preset Definitions ──────────────────────────────────────────────────────── + +class TestPresets: + """Test preset definitions.""" + + def test_all_presets_have_required_fields(self): + for name, preset in PRESETS.items(): + assert "name" in preset, f"{name} missing 'name'" + assert "description" in preset, f"{name} missing 'description'" + assert "min_overhead_gb" in preset, f"{name} missing 'min_overhead_gb'" + assert "compression_ratio" in preset, f"{name} missing 'compression_ratio'" + assert "quality" in preset, f"{name} missing 'quality'" + assert "vllm_compatible" in preset, f"{name} missing 'vllm_compatible'" + assert "kv_type" in preset, f"{name} missing 'kv_type'" + assert "layer_adaptive" in preset, f"{name} missing 'layer_adaptive'" + + def test_quality_order_matches_presets(self): + for name in QUALITY_ORDER: + assert name in PRESETS, f"{name} in QUALITY_ORDER but not in PRESETS" + + def test_all_presets_in_quality_order(self): + for name in PRESETS: + assert name in QUALITY_ORDER, f"{name} in PRESETS but not in QUALITY_ORDER" + + def test_compression_increases_with_aggressiveness(self): + """More aggressive presets should have higher compression.""" + for i in range(len(QUALITY_ORDER) - 1): + current = PRESETS[QUALITY_ORDER[i]] + next_p = PRESETS[QUALITY_ORDER[i + 1]] + # TurboQuant presets should increase in compression + # q4_0 is a fallback and may break the pattern + if 
QUALITY_ORDER[i + 1] != "q4_0": + assert current["compression_ratio"] <= next_p["compression_ratio"], ( + f"{QUALITY_ORDER[i]} ({current['compression_ratio']}x) should have " + f"<= compression than {QUALITY_ORDER[i+1]} ({next_p['compression_ratio']}x)" + ) + + def test_overhead_decreases_with_aggressiveness(self): + """More aggressive presets should have lower overhead requirements.""" + for i in range(len(QUALITY_ORDER) - 1): + current = PRESETS[QUALITY_ORDER[i]] + next_p = PRESETS[QUALITY_ORDER[i + 1]] + assert current["min_overhead_gb"] >= next_p["min_overhead_gb"], ( + f"{QUALITY_ORDER[i]} (overhead={current['min_overhead_gb']}GB) should have " + f">= overhead than {QUALITY_ORDER[i+1]} (overhead={next_p['min_overhead_gb']}GB)" + ) + + +# ── SystemInfo ──────────────────────────────────────────────────────────────── + +class TestSystemInfo: + """Test system detection.""" + + def test_detect_returns_info(self): + info = SystemInfo.detect() + assert info.total_gb > 0 + assert info.available_gb > 0 + assert info.detection_method + + def test_available_less_than_total(self): + info = SystemInfo.detect() + assert info.available_gb <= info.total_gb + + +# ── Config ──────────────────────────────────────────────────────────────────── + +class TestConfig: + """Test config loading and saving.""" + + def test_load_config_missing_file(self, tmp_path): + config = _load_config(str(tmp_path / "nonexistent.json")) + assert config == {} + + def test_load_config_valid_file(self, tmp_path): + config_file = tmp_path / "turboquant.json" + config_file.write_text(json.dumps({"preset_override": "turboquant_4bit_nc"})) + config = _load_config(str(config_file)) + assert config["preset_override"] == "turboquant_4bit_nc" + + def test_save_and_load_config(self, tmp_path): + config_file = tmp_path / "turboquant.json" + save_config({"preset_override": "turboquant_k8v4", "context_length": 32768}, + str(config_file)) + config = _load_config(str(config_file)) + assert config["preset_override"] 
# ── Auto-Select ───────────────────────────────────────────────────────────────

class TestAutoSelect:
    """auto_select with mocked system detection."""

    def test_auto_select_with_override(self, tmp_path):
        res = auto_select(model_size_gb=14.0, config_override="turboquant_4bit_nc")
        assert res.preset == "turboquant_4bit_nc"
        assert "Config override" in res.reason

    def test_auto_select_unknown_override_ignored(self):
        # An unknown preset name should fall back to normal detection.
        res = auto_select(model_size_gb=14.0, config_override="nonexistent_preset")
        assert res.preset in QUALITY_ORDER

    def test_auto_select_config_override(self, tmp_path):
        """Config file preset_override is respected."""
        cfg_file = tmp_path / "turboquant.json"
        cfg_file.write_text(json.dumps({"preset_override": "turboquant_3bit_nc"}))
        res = auto_select(model_size_gb=14.0, config_path=str(cfg_file))
        assert res.preset == "turboquant_3bit_nc"

    def test_auto_select_detects_system(self):
        with patch("turboquant.auto_select.SystemInfo.detect") as fake_detect:
            fake_detect.return_value = SystemInfo(
                total_gb=32, available_gb=24, is_apple_silicon=True,
                detection_method="mock"
            )
            res = auto_select(model_size_gb=14.0)
        assert res.system_info is not None
        assert res.system_info.total_gb == 32
        assert res.preset == "turboquant_k8v4"  # 24 - 14 = 10 GB overhead

    def test_auto_select_tight_memory(self):
        with patch("turboquant.auto_select.SystemInfo.detect") as fake_detect:
            fake_detect.return_value = SystemInfo(
                total_gb=16, available_gb=12, detection_method="mock"
            )
            res = auto_select(model_size_gb=10.0)
        # 12 - 10 = 2 GB overhead -> turboquant_3bit_nc (min 2 GB)
        assert res.preset == "turboquant_3bit_nc"


# ── Format ────────────────────────────────────────────────────────────────────

class TestFormat:
    """Formatting utilities."""

    def test_format_env_commands(self):
        out = format_env_commands(select_preset(available_gb=20, model_size_gb=10))
        assert "export TURBO_LAYER_ADAPTIVE=7" in out
        assert "turboquant_k8v4" in out

    def test_format_env_commands_with_warnings(self):
        out = format_env_commands(select_preset(available_gb=5, model_size_gb=4))
        assert "WARNING" in out


# ── Integration with issue spec ───────────────────────────────────────────────

class TestIssueSpec:
    """Verify implementation matches the exact logic from issue #97."""

    def test_overhead_8gb_plus(self):
        """overhead >= 8 -> turboquant_k8v4"""
        assert select_preset(available_gb=15, model_size_gb=7).preset == "turboquant_k8v4"

    def test_overhead_4_to_8(self):
        """4 <= overhead < 8 -> turboquant_4bit_nc"""
        assert select_preset(available_gb=11, model_size_gb=7).preset == "turboquant_4bit_nc"

    def test_overhead_2_to_4(self):
        """2 <= overhead < 4 -> turboquant_3bit_nc"""
        assert select_preset(available_gb=9, model_size_gb=7).preset == "turboquant_3bit_nc"

    def test_overhead_under_2(self):
        """overhead < 2 -> q4_0"""
        assert select_preset(available_gb=8, model_size_gb=7).preset == "q4_0"
compression.""" + +from turboquant.auto_select import ( + auto_select, + select_preset, + list_presets, + get_preset_info, + SystemInfo, + SelectionResult, + PRESETS, + QUALITY_ORDER, +) + +__all__ = [ + "auto_select", + "select_preset", + "list_presets", + "get_preset_info", + "SystemInfo", + "SelectionResult", + "PRESETS", + "QUALITY_ORDER", +] diff --git a/turboquant/auto_select.py b/turboquant/auto_select.py new file mode 100644 index 00000000..88ae88aa --- /dev/null +++ b/turboquant/auto_select.py @@ -0,0 +1,557 @@ +#!/usr/bin/env python3 +""" +TurboQuant Auto-Select — Choose optimal KV cache preset based on available memory. + +Implements the config helper from issue #97: detects available memory and +selects the best TurboQuant preset for the deployment environment. + +Presets map to deployment configurations (not raw quant levels): + turboquant_k8v4 — 8+ GB overhead, best quality (k/v asymmetric: 8-bit K, 4-bit V) + turboquant_4bit_nc — 4-8 GB overhead, good quality (4-bit, no calibration) + turboquant_3bit_nc — 2-4 GB overhead, usable quality (3-bit, no calibration) + q4_0 — <2 GB overhead, GGUF fallback (no vLLM) + +Usage: + from turboquant.auto_select import auto_select + + result = auto_select(model_size_gb=7.0) + print(f"Selected: {result.preset}") + +CLI: + python3 turboquant/auto_select.py --model-size 7.0 + python3 turboquant/auto_select.py --model-size 7.0 --json + python3 turboquant/auto_select.py --list +""" + +import json +import logging +import os +import platform +import subprocess +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Optional + +logger = logging.getLogger(__name__) + + +# ── Preset Definitions ──────────────────────────────────────────────────────── + +PRESETS: Dict[str, Dict[str, Any]] = { + "turboquant_k8v4": { + "name": "TurboQuant K8V4", + "description": "Best quality — asymmetric 8-bit K, 4-bit V", + "min_overhead_gb": 8, + "compression_ratio": 2.6, + "quality": "best", + 
"vllm_compatible": True, + "kv_type": "turbo4", + "layer_adaptive": 7, + }, + "turboquant_4bit_nc": { + "name": "TurboQuant 4-bit NC", + "description": "Good quality — 4-bit, no calibration", + "min_overhead_gb": 4, + "compression_ratio": 3.8, + "quality": "good", + "vllm_compatible": True, + "kv_type": "turbo4", + "layer_adaptive": 5, + }, + "turboquant_3bit_nc": { + "name": "TurboQuant 3-bit NC", + "description": "Usable quality — 3-bit, no calibration", + "min_overhead_gb": 2, + "compression_ratio": 4.9, + "quality": "usable", + "vllm_compatible": True, + "kv_type": "turbo3", + "layer_adaptive": 3, + }, + "q4_0": { + "name": "Q4_0 GGUF", + "description": "GGUF fallback — no TurboQuant, no vLLM", + "min_overhead_gb": 0, + "compression_ratio": 4.0, + "quality": "basic", + "vllm_compatible": False, + "kv_type": "q4_0", + "layer_adaptive": 0, + }, +} + +# Ordered from best quality to most aggressive +QUALITY_ORDER = ["turboquant_k8v4", "turboquant_4bit_nc", "turboquant_3bit_nc", "q4_0"] + + +# ── Data Classes ────────────────────────────────────────────────────────────── + +@dataclass +class SystemInfo: + """Detected system resources.""" + total_gb: float + available_gb: float + gpu_memory_gb: Optional[float] = None + gpu_name: Optional[str] = None + is_apple_silicon: bool = False + detection_method: str = "" + + @classmethod + def detect(cls) -> "SystemInfo": + """Detect available memory. 
@dataclass
class SystemInfo:
    """Detected system resources (all sizes in GiB)."""
    total_gb: float                       # physical memory
    available_gb: float                   # memory usable for the model + KV cache
    gpu_memory_gb: Optional[float] = None
    gpu_name: Optional[str] = None
    is_apple_silicon: bool = False
    detection_method: str = ""            # how the numbers were obtained

    @classmethod
    def detect(cls) -> "SystemInfo":
        """Detect available memory. Uses platform-appropriate methods."""
        system = platform.system()

        if system == "Darwin":
            return cls._detect_macos()
        elif system == "Linux":
            return cls._detect_linux()
        else:
            return cls._detect_fallback(system)

    @classmethod
    def _detect_macos(cls) -> "SystemInfo":
        """Detect Apple Silicon unified memory via sysctl / vm_stat."""
        info = cls(total_gb=0, available_gb=0, is_apple_silicon=True, detection_method="sysctl")
        try:
            r = subprocess.run(["sysctl", "-n", "hw.memsize"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                info.total_gb = int(r.stdout.strip()) / (1024 ** 3)

            r = subprocess.run(["vm_stat"], capture_output=True, text=True, timeout=5)
            if r.returncode == 0:
                # BUGFIX: page size was hard-coded to 4096, but Apple Silicon
                # uses 16 KiB pages, so free memory was underestimated 4x.
                # vm_stat's header reports the real value:
                #   "Mach Virtual Memory Statistics: (page size of 16384 bytes)"
                page_size = 4096  # fallback if the header is missing
                free_pages = 0
                for line in r.stdout.split("\n"):
                    if "page size of" in line:
                        try:
                            page_size = int(line.split("page size of", 1)[1].split()[0])
                        except (ValueError, IndexError):
                            pass
                    elif "Pages free:" in line:
                        try:
                            free_pages = int(line.split(":")[-1].strip().rstrip("."))
                        except ValueError:
                            pass
                info.available_gb = (free_pages * page_size) / (1024 ** 3)

            if info.available_gb < 1:
                # Free pages alone under-report usable memory (macOS keeps
                # caches resident); assume 70% of total is usable instead.
                info.available_gb = info.total_gb * 0.70

            info.gpu_memory_gb = info.total_gb  # Unified memory

            r = subprocess.run(
                ["system_profiler", "SPDisplaysDataType"],
                capture_output=True, text=True, timeout=10
            )
            if r.returncode == 0:
                for line in r.stdout.split("\n"):
                    if "Chipset" in line or "GPU" in line:
                        info.gpu_name = line.split(":")[-1].strip()
                        break
        except Exception as e:
            logger.warning("macOS detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"

        return info

    @classmethod
    def _detect_linux(cls) -> "SystemInfo":
        """Detect Linux with optional NVIDIA GPU (nvidia-smi)."""
        info = cls(total_gb=0, available_gb=0, detection_method="proc")
        try:
            # /proc/meminfo reports in kiB.
            with open("/proc/meminfo") as f:
                for line in f:
                    if line.startswith("MemTotal:"):
                        info.total_gb = int(line.split()[1]) / (1024 * 1024)
                    elif line.startswith("MemAvailable:"):
                        info.available_gb = int(line.split()[1]) / (1024 * 1024)

            try:
                r = subprocess.run(
                    ["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
                     "--format=csv,noheader,nounits"],
                    capture_output=True, text=True, timeout=10
                )
                if r.returncode == 0 and r.stdout.strip():
                    # First GPU only; values are in MiB.
                    parts = r.stdout.strip().split("\n")[0].split(", ")
                    if len(parts) >= 3:
                        info.gpu_name = parts[0].strip()
                        info.gpu_memory_gb = float(parts[1]) / 1024
                        info.available_gb = max(info.available_gb, float(parts[2]) / 1024)
                        info.detection_method = "nvidia-smi"
            except (FileNotFoundError, subprocess.TimeoutExpired):
                pass  # no NVIDIA tooling installed — CPU-only numbers stand
        except Exception as e:
            logger.warning("Linux detection failed: %s", e)
            info.total_gb = 16.0
            info.available_gb = 12.0
            info.detection_method = "fallback"

        return info

    @classmethod
    def _detect_fallback(cls, system: str) -> "SystemInfo":
        """Fallback for unknown systems. Tries psutil, else fixed defaults."""
        try:
            import psutil
            mem = psutil.virtual_memory()
            return cls(
                total_gb=round(mem.total / (1024 ** 3), 1),
                available_gb=round(mem.available / (1024 ** 3), 1),
                detection_method="psutil",
            )
        except ImportError:
            return cls(total_gb=16.0, available_gb=12.0, detection_method="fallback")
@dataclass
class SelectionResult:
    """Result of preset selection."""
    preset: str                      # preset key into PRESETS
    reason: str                      # human-readable selection rationale
    overhead_gb: float               # available memory minus model size
    quality: str
    compression_ratio: float
    vllm_compatible: bool
    kv_type: str
    layer_adaptive: int
    env_vars: Dict[str, str] = field(default_factory=dict)
    server_flags: Dict[str, str] = field(default_factory=dict)
    warnings: list = field(default_factory=list)
    system_info: Optional["SystemInfo"] = None

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; system info added when present."""
        out = {
            "preset": self.preset,
            "reason": self.reason,
            "overhead_gb": round(self.overhead_gb, 2),
            "quality": self.quality,
            "compression_ratio": self.compression_ratio,
            "vllm_compatible": self.vllm_compatible,
            "kv_type": self.kv_type,
            "layer_adaptive": self.layer_adaptive,
            "env_vars": self.env_vars,
            "server_flags": self.server_flags,
            "warnings": self.warnings,
        }
        si = self.system_info
        if si:
            out["system"] = {
                "total_gb": round(si.total_gb, 1),
                "available_gb": round(si.available_gb, 1),
                "gpu_name": si.gpu_name,
                "is_apple_silicon": si.is_apple_silicon,
                "detection_method": si.detection_method,
            }
        return out


# ── Selection Logic ───────────────────────────────────────────────────────────

def select_preset(
    available_gb: float,
    model_size_gb: float,
    require_vllm: bool = False,
) -> SelectionResult:
    """Select the best TurboQuant preset based on memory overhead.

    Args:
        available_gb: Available system memory in GB
        model_size_gb: Model size in GB
        require_vllm: If True, only select vLLM-compatible presets

    Returns:
        SelectionResult with chosen preset and reasoning
    """
    overhead = available_gb - model_size_gb

    # Not even the model fits — fall straight through to the GGUF fallback.
    if overhead < 0:
        logger.warning(
            "Insufficient memory: need %.1f GB, have %.1f GB available",
            model_size_gb, available_gb
        )
        return _make_result("q4_0", overhead,
                            reason=f"Insufficient memory ({overhead:.1f} GB deficit), using GGUF fallback")

    # Walk quality order (best first) and take the first preset that fits.
    for candidate in QUALITY_ORDER:
        spec = PRESETS[candidate]

        if require_vllm and not spec["vllm_compatible"]:
            continue

        if overhead >= spec["min_overhead_gb"]:
            reason = (
                f"Overhead {overhead:.1f} GB >= {spec['min_overhead_gb']} GB "
                f"required for {spec['name']}"
            )
            return _make_result(candidate, overhead, reason=reason)

    # Nothing fits — aggressive fallback
    return _make_result("q4_0", overhead,
                        reason=f"Overhead {overhead:.1f} GB too low for TurboQuant, using GGUF fallback")
"-ctv": p["kv_type"]} + + warnings = [] + if overhead_gb < 2.0 and overhead_gb >= 0: + warnings.append(f"Low headroom ({overhead_gb:.1f} GB). Consider reducing context length.") + if overhead_gb < 0: + warnings.append( + f"OVERCOMMITTED: model needs {abs(overhead_gb):.1f} GB more than available. " + f"Inference may fail or swap heavily." + ) + + return SelectionResult( + preset=preset_name, + reason=reason, + overhead_gb=overhead_gb, + quality=p["quality"], + compression_ratio=p["compression_ratio"], + vllm_compatible=p["vllm_compatible"], + kv_type=p["kv_type"], + layer_adaptive=p["layer_adaptive"], + env_vars=env_vars, + server_flags=server_flags, + warnings=warnings, + ) + + +# ── Auto-Select (with system detection + config) ───────────────────────────── + +def auto_select( + model_size_gb: float, + config_override: Optional[str] = None, + require_vllm: bool = False, + config_path: Optional[str] = None, +) -> SelectionResult: + """Auto-select preset based on system detection. + + Args: + model_size_gb: Model size in GB + config_override: Force a specific preset (skips detection) + require_vllm: Require vLLM compatibility + config_path: Path to config file (default: $HERMES_HOME/turboquant.json) + + Returns: + SelectionResult + """ + # Load config file if exists + config = _load_config(config_path) + + # Config override takes precedence + override = config_override or config.get("preset_override") + if override: + if override in PRESETS: + logger.info("Using config override: %s", override) + p = PRESETS[override] + return SelectionResult( + preset=override, + reason=f"Config override: {p['name']}", + overhead_gb=0, + quality=p["quality"], + compression_ratio=p["compression_ratio"], + vllm_compatible=p["vllm_compatible"], + kv_type=p["kv_type"], + layer_adaptive=p["layer_adaptive"], + env_vars={"TURBO_LAYER_ADAPTIVE": str(p["layer_adaptive"])}, + server_flags={"-ctk": p["kv_type"], "-ctv": p["kv_type"]}, + ) + else: + logger.warning("Unknown preset override '%s', 
falling back to auto-select", override) + + # Detect system + sys_info = SystemInfo.detect() + logger.info( + "System: %.1f GB total, %.1f GB available, model: %.1f GB", + sys_info.total_gb, sys_info.available_gb, model_size_gb + ) + + # Select + result = select_preset( + available_gb=sys_info.available_gb, + model_size_gb=model_size_gb, + require_vllm=require_vllm, + ) + result.system_info = sys_info + + # Apply context length from config + context_length = config.get("context_length") + if context_length: + result.server_flags["-c"] = str(context_length) + + return result + + +def _load_config(config_path: Optional[str] = None) -> dict: + """Load turboquant config from JSON file.""" + if config_path: + p = Path(config_path) + else: + try: + from hermes_constants import get_hermes_home + p = get_hermes_home() / "turboquant.json" + except (ImportError, Exception): + p = Path.home() / ".hermes" / "turboquant.json" + + if p.exists(): + try: + return json.loads(p.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + pass + return {} + + +def save_config(values: dict, config_path: Optional[str] = None) -> None: + """Save turboquant config to JSON file.""" + if config_path: + p = Path(config_path) + else: + try: + from hermes_constants import get_hermes_home + p = get_hermes_home() / "turboquant.json" + except (ImportError, Exception): + p = Path.home() / ".hermes" / "turboquant.json" + + p.parent.mkdir(parents=True, exist_ok=True) + + existing = {} + if p.exists(): + try: + existing = json.loads(p.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + pass + + existing.update(values) + p.write_text(json.dumps(existing, indent=2), encoding="utf-8") + logger.info("Config saved to %s", p) + + +# ── Utility ─────────────────────────────────────────────────────────────────── + +def get_preset_info(preset_name: str) -> Optional[dict]: + """Get information about a preset.""" + return PRESETS.get(preset_name) + + +def list_presets() -> 
def list_presets() -> dict:
    """Return a (shallow) copy of all preset definitions."""
    return PRESETS.copy()


def format_env_commands(result: "SelectionResult") -> str:
    """Format a selection result as shell export commands plus comments.

    Emits the preset summary, the reason, one `export` line per env var,
    the llama-server flags as a comment, and any warnings.
    """
    lines = [f"# TurboQuant preset: {result.preset} ({result.quality}, {result.compression_ratio}x)"]
    lines.append(f"# {result.reason}")
    for k, v in result.env_vars.items():
        lines.append(f"export {k}={v}")
    flags = " ".join(f"{k} {v}" for k, v in result.server_flags.items())
    if flags:
        lines.append(f"# llama-server flags: {flags}")
    for w in result.warnings:
        lines.append(f"# WARNING: {w}")
    return "\n".join(lines)


# ── CLI ───────────────────────────────────────────────────────────────────────

def main():
    """CLI entry point."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Auto-select TurboQuant preset based on available memory",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --model-size 14.0                 # Auto-detect and select
  %(prog)s --model-size 7.0 --json           # JSON output
  %(prog)s --model-size 7.0 --list           # List available presets
  %(prog)s --model-size 7.0 --require-vllm
  %(prog)s --model-size 7.0 --preset turboquant_4bit_nc   # Override
""",
    )
    parser.add_argument("--model-size", type=float, help="Model size in GB")
    parser.add_argument("--preset", help="Force a specific preset (skip detection)")
    parser.add_argument("--require-vllm", action="store_true", help="Require vLLM compatibility")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--list", action="store_true", help="List available presets")
    parser.add_argument("--shell", action="store_true", help="Output as shell export commands")
    parser.add_argument("--detect-only", action="store_true", help="Only detect hardware")
    parser.add_argument("--config", help="Path to config file")

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    if args.list:
        print("Available presets:")
        print(f"  {'Name':22} {'Quality':8} {'Compress':8} {'vLLM':5} {'Overhead':8} Description")
        for name in QUALITY_ORDER:
            p = PRESETS[name]
            vllm = "yes" if p["vllm_compatible"] else "no"
            print(
                f"  {name:22} {p['quality']:8} {p['compression_ratio']:.1f}x     "
                f"{vllm:5} >= {p['min_overhead_gb']:2.0f} GB  {p['description']}"
            )
        return

    if args.detect_only:
        info = SystemInfo.detect()
        if args.json:
            print(json.dumps({
                "total_gb": round(info.total_gb, 1),
                "available_gb": round(info.available_gb, 1),
                "gpu_memory_gb": round(info.gpu_memory_gb, 1) if info.gpu_memory_gb else None,
                "gpu_name": info.gpu_name,
                "is_apple_silicon": info.is_apple_silicon,
                "detection_method": info.detection_method,
            }, indent=2))
        else:
            print(f"Total memory: {info.total_gb:.1f} GB")
            print(f"Available: {info.available_gb:.1f} GB")
            if info.gpu_memory_gb:
                print(f"GPU memory: {info.gpu_memory_gb:.1f} GB")
            if info.gpu_name:
                print(f"GPU: {info.gpu_name}")
            if info.is_apple_silicon:
                print("Chip: Apple Silicon")
            print(f"Detection: {info.detection_method}")
        return

    # FIX: was `if not args.model_size:` which also rejected an explicit
    # `--model-size 0` (0.0 is falsy). Only a truly absent flag should error.
    if args.model_size is None:
        parser.error("--model-size is required (unless using --list or --detect-only)")

    result = auto_select(
        model_size_gb=args.model_size,
        config_override=args.preset,
        require_vllm=args.require_vllm,
        config_path=args.config,
    )

    if args.json:
        print(json.dumps(result.to_dict(), indent=2))
    elif args.shell:
        print(format_env_commands(result))
    else:
        print(f"Selected: {result.preset} ({result.quality}, {result.compression_ratio}x)")
        print(f"  {result.reason}")
        if result.system_info:
            print(f"  System: {result.system_info.total_gb:.0f} GB total, {result.system_info.available_gb:.0f} GB available")
        print(f"  Overhead: {result.overhead_gb:.1f} GB")
        print()
        print("Environment:")
        for k, v in result.env_vars.items():
            print(f"  export {k}={v}")
        print("Server flags:")
        for k, v in result.server_flags.items():
            print(f"  {k} {v}")
        if result.warnings:
            print()
            for w in result.warnings:
                print(f"  WARNING: {w}")


if __name__ == "__main__":
    main()