2026-04-15 15:04:41 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""Tests for quant_selector.py"""
|
|
|
|
|
|
|
|
|
|
import sys
|
|
|
|
|
import os
|
|
|
|
|
import pytest
|
|
|
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
|
|
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
|
|
|
|
from evolution.quant_selector import (
|
|
|
|
|
QuantLevel,
|
|
|
|
|
HardwareInfo,
|
|
|
|
|
QUANT_LEVELS,
|
|
|
|
|
detect_hardware,
|
|
|
|
|
estimate_kv_cache_gb,
|
|
|
|
|
estimate_model_memory_gb,
|
|
|
|
|
select_quant_level,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestQuantLevels:
    """Structural invariants of the ``QUANT_LEVELS`` table."""

    # Names identifying TurboQuant entries; any other entry in QUANT_LEVELS
    # is treated by these tests as a non-TurboQuant fallback (e.g. q4_0).
    TURBO_QUANT_NAMES = frozenset({"turbo4", "turbo3", "turbo2"})

    def test_levels_ordered_by_quality(self):
        """TurboQuant levels should be ordered from best quality to most aggressive.

        The quality ordering invariant for TurboQuant levels is monotonically
        increasing compression_ratio (more aggressive = more compression).
        Non-TurboQuant fallbacks (e.g. q4_0) are placed after all TurboQuant
        levels and may have any compression ratio — they exist as safe defaults,
        not as part of the quality progression.
        """
        turbo_levels = [
            level for level in QUANT_LEVELS if level.name in self.TURBO_QUANT_NAMES
        ]
        # Without this guard the loop below would pass vacuously if the level
        # names were ever changed and nothing matched.
        assert turbo_levels, "QUANT_LEVELS contains no TurboQuant levels to check"

        # Compare each TurboQuant level with the one that follows it.
        for current, following in zip(turbo_levels, turbo_levels[1:]):
            assert current.compression_ratio <= following.compression_ratio, (
                f"TurboQuant {current.name} (compression={current.compression_ratio}x) "
                f"should have <= compression than {following.name} "
                f"(compression={following.compression_ratio}x)"
            )

    def test_fallback_quant_is_last(self):
        """Non-TurboQuant fallbacks (e.g. q4_0) should be at the end of the list."""
        found_fallback = False
        for level in QUANT_LEVELS:
            if level.name not in self.TURBO_QUANT_NAMES:
                found_fallback = True
            elif found_fallback:
                # A TurboQuant level was seen after at least one fallback.
                pytest.fail(
                    f"TurboQuant level '{level.name}' appears after a fallback level. "
                    "All TurboQuant levels must precede fallbacks."
                )

    def test_all_levels_have_required_fields(self):
        """Every level has truthy identifiers and in-range numeric fields."""
        for level in QUANT_LEVELS:
            assert level.name
            assert level.bits_per_channel > 0
            assert level.compression_ratio > 1
            assert level.quality_label
            assert level.layer_adaptive >= 0
            assert level.kv_type
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestKVEstimate:
    """Sanity checks for ``estimate_kv_cache_gb``."""

    @staticmethod
    def _kv_gb(context_length, bits):
        """Estimate KV cache size for the fixed test model shape.

        The middle positional arguments are the ones every test here uses:
        48 layers, 8 heads, 128 dim.
        """
        return estimate_kv_cache_gb(context_length, 48, 8, 128, bits)

    def test_basic_estimate(self):
        """A 32K context at 3.5 bits yields a positive, modest estimate."""
        estimate = self._kv_gb(32768, 3.5)
        # Should be reasonable: strictly positive and below 10.
        assert 0 < estimate < 10

    def test_longer_context_larger(self):
        """Quadrupling the context length must increase the estimate."""
        short_ctx = self._kv_gb(32768, 3.5)
        long_ctx = self._kv_gb(131072, 3.5)
        assert long_ctx > short_ctx

    def test_higher_bits_larger(self):
        """More bits at the same context length must increase the estimate."""
        four_bit = self._kv_gb(32768, 4.0)
        two_bit = self._kv_gb(32768, 2.0)
        assert four_bit > two_bit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestHardwareDetection:
    """Tests for hardware detection in ``evolution.quant_selector``."""

    def test_detect_returns_info(self):
        """Unmocked detection reports positive memory figures and a method."""
        hw = detect_hardware()
        assert hw.total_memory_gb > 0
        assert hw.available_memory_gb > 0
        assert hw.detection_method

    @patch("evolution.quant_selector.platform.system", return_value="Linux")
    def test_linux_detection(self, mock_system):
        """Linux detection reads total memory from a faked meminfo file.

        ``builtins.open`` is replaced with ``mock_open`` so the fake content is
        served regardless of how the detector consumes the file (``read()``,
        ``readline()``, ``readlines()`` or line iteration), rather than only
        through ``open(...).__enter__().read()``.
        """
        # Imported locally because the module header only pulls in ``patch``
        # and ``MagicMock`` from unittest.mock.
        from unittest.mock import mock_open

        meminfo = (
            "MemTotal: 32000000 kB\n"
            "MemAvailable: 24000000 kB\n"
        )
        # Keep the patch scoped to the detector call so that anything pytest
        # does afterwards (e.g. failure reporting) sees the real ``open``.
        with patch("builtins.open", mock_open(read_data=meminfo)):
            hw = _detect_linux_fallback()

        assert hw.total_memory_gb > 20
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _detect_linux_fallback():
    """Run the private ``_detect_linux`` detector and return its result.

    The import happens at call time, so the private name is only looked up
    when a test actually invokes this helper (typically with
    ``/proc/meminfo`` access mocked out).
    """
    from evolution.quant_selector import _detect_linux as linux_detector

    return linux_detector()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestSelection:
    """Tests for ``select_quant_level`` with hardware detection mocked out."""

    # Patch target shared by every test in this class.
    _DETECT_HARDWARE = "evolution.quant_selector.detect_hardware"

    def test_selects_turbo4_for_large_memory(self):
        """With plenty of memory, should pick turbo4 (best quality)."""
        roomy = HardwareInfo(
            total_memory_gb=64,
            available_memory_gb=48,
            gpu_memory_gb=64,
            gpu_name="Test GPU",
            cpu_cores=16,
            detection_method="mock",
        )
        with patch(self._DETECT_HARDWARE, return_value=roomy):
            sel = select_quant_level(model_size_gb=14.0, context_length=32768)
            assert sel.level.name == "turbo4"
            assert sel.headroom_gb > 0

    def test_selects_smaller_for_tight_memory(self):
        """With tight memory, should pick a smaller quant."""
        cramped = HardwareInfo(
            total_memory_gb=16,
            available_memory_gb=12,
            gpu_memory_gb=16,
            gpu_name="Test GPU",
            cpu_cores=8,
            detection_method="mock",
        )
        with patch(self._DETECT_HARDWARE, return_value=cramped):
            sel = select_quant_level(model_size_gb=14.0, context_length=131072)
            # Should pick a smaller quant for 128K context on 16GB.
            # NOTE(review): if no entry in QUANT_LEVELS exceeds 4.0 bits per
            # channel this bound can never fail — confirm against the table.
            assert sel.level.bits_per_channel <= 4.0

    def test_preferred_level(self):
        """User can force a specific level."""
        machine = HardwareInfo(
            total_memory_gb=64,
            available_memory_gb=48,
            cpu_cores=16,
            detection_method="mock",
        )
        with patch(self._DETECT_HARDWARE, return_value=machine):
            sel = select_quant_level(
                model_size_gb=14.0,
                context_length=32768,
                preferred_level="turbo2",
            )
            assert sel.level.name == "turbo2"

    def test_env_vars_populated(self):
        """The selection exposes the expected env var and server flags."""
        machine = HardwareInfo(
            total_memory_gb=64,
            available_memory_gb=48,
            cpu_cores=16,
            detection_method="mock",
        )
        with patch(self._DETECT_HARDWARE, return_value=machine):
            sel = select_quant_level(model_size_gb=14.0, context_length=32768)
            assert "TURBO_LAYER_ADAPTIVE" in sel.env_vars
            assert "-ctk" in sel.server_flags
            assert "-ctv" in sel.server_flags

    def test_warnings_on_low_headroom(self):
        """A 16GB model on an 18GB machine should produce at least one warning."""
        snug = HardwareInfo(
            total_memory_gb=18,
            available_memory_gb=14,
            gpu_memory_gb=18,
            gpu_name="Test GPU",
            cpu_cores=8,
            detection_method="mock",
        )
        with patch(self._DETECT_HARDWARE, return_value=snug):
            sel = select_quant_level(model_size_gb=16.0, context_length=65536)
            assert len(sel.warnings) > 0

    def test_reasoning_contains_key_info(self):
        """The reasoning text names the chosen level and the hardware."""
        apple = HardwareInfo(
            total_memory_gb=32,
            available_memory_gb=24,
            is_apple_silicon=True,
            chip_name="M4 Max",
            cpu_cores=16,
            detection_method="mock",
        )
        with patch(self._DETECT_HARDWARE, return_value=apple):
            sel = select_quant_level(model_size_gb=14.0, context_length=32768)
            assert "turbo4" in sel.reasoning
            # Either the chip name or the memory size is accepted.
            assert "M4 Max" in sel.reasoning or "32GB" in sel.reasoning
|