# - Add gemma4-turboquant.yaml profile for Hermes
# - Configure local llama.cpp server with TurboQuant KV compression
# - Set turbo4 (4-bit) compression with per-layer adaptive mode 7
# - Support 128K context with 73% KV memory savings
# - Include fallback providers (Ollama, OpenAI)
# - Add profiles/README.md with setup and usage instructions
# - Document performance expectations and troubleshooting
#
# Closes #28
170 lines
4.7 KiB
YAML
170 lines
4.7 KiB
YAML
# Hermes Profile: Gemma 4 + TurboQuant KV Cache Compression
# For use with a local llama.cpp server running TurboQuant-enabled inference.
# Install by copying this file to ~/.hermes/profiles/gemma4-turboquant.yaml

# Identity metadata for this profile (name, version, human-readable summary).
profile:
  name: "gemma4-turboquant"
  version: "1.0.0"  # Quoted so it stays a string rather than being parsed as a number
  description: "Gemma 4 model with TurboQuant KV cache compression for extended context on Apple Silicon"

# Inference providers: one primary local server plus two fallbacks.
providers:
  # Primary provider: local llama.cpp server with TurboQuant
  primary:
    type: "llama.cpp"
    name: "local-turboquant"
    endpoint: "http://localhost:8081"
    api_path: "/v1/chat/completions"
    timeout_ms: 120000

    # Model configuration
    model:
      name: "gemma-4"
      path: "/path/to/gemma-4-q4_k_m.gguf"  # Update with actual model path

    # TurboQuant KV cache compression settings
    turboquant:
      enabled: true
      kv_type: "turbo4"  # Options: turbo2, turbo3, turbo4 (4-bit recommended)
      layer_adaptive_mode: 7  # Per-layer adaptive quantization (0-7, 7=best quality/ratio)

    # Context and memory settings
    context:
      max_tokens: 131072  # 128K context with TurboQuant compression
      batch_size: 512

    # Generation parameters
    generation:
      temperature: 0.7
      top_p: 0.9
      top_k: 40
      repeat_penalty: 1.1
      frequency_penalty: 0.0
      presence_penalty: 0.0

    # Server startup command (for reference).
    # Keep -m, --port, -ctk/-ctv and -c in sync with model.path, endpoint,
    # turboquant.kv_type and context.max_tokens above.
    # NOTE(review): --host 0.0.0.0 listens on every network interface, while
    # `endpoint` above only needs localhost. Use --host 127.0.0.1 unless
    # remote clients must reach this server.
    server_command: |
      export TURBO_LAYER_ADAPTIVE=7
      llama-server \
        -m /path/to/gemma-4-q4_k_m.gguf \
        --port 8081 \
        -ctk turbo4 -ctv turbo4 \
        -c 131072 \
        --host 0.0.0.0

  # Fallback provider 1: Ollama (standard, no TurboQuant)
  fallback_1:
    type: "ollama"
    name: "ollama-gemma4"
    endpoint: "http://localhost:11434"
    api_path: "/api/chat"
    timeout_ms: 120000

    model:
      name: "gemma4:latest"

    generation:
      temperature: 0.7
      top_p: 0.9
      top_k: 40

  # Fallback provider 2: OpenAI-compatible API (cloud backup)
  fallback_2:
    type: "openai"
    name: "openai-backup"
    endpoint: "https://api.openai.com"
    api_path: "/v1/chat/completions"
    timeout_ms: 60000

    model:
      name: "gpt-4"

    generation:
      temperature: 0.7
      max_tokens: 4096

# Performance and monitoring
performance:
  # Memory management for TurboQuant
  memory:
    max_gpu_memory_gb: 28  # Leave headroom on 36GB M3 Max
    kv_cache_compression: "turbo4"
    estimated_savings: "73%"  # TurboQuant delivers ~73% KV memory savings

  # Benchmarking integration: metrics to collect when benchmarks are enabled
  benchmarks:
    enabled: true
    metrics:
      - "tokens_per_second"
      - "time_to_first_token"
      - "peak_memory_usage"
      - "perplexity"

# Quality validation
quality:
  # Test prompts for quality comparison
  test_prompts:
    enabled: true
    prompt_file: "benchmarks/prompts.json"

  # Perplexity testing; the largest context length equals the 128K (131072) maximum
  perplexity:
    enabled: true
    corpus: "wikitext-2-raw"
    context_lengths: [8192, 32768, 65536, 131072]

# Environment variables (applied when using this profile).
# Values are quoted so they stay strings rather than being parsed as integers.
environment:
  TURBO_LAYER_ADAPTIVE: "7"  # Per-layer adaptive quantization mode
  GGML_METAL_DEBUG: "0"  # Disable Metal debug in production
  OMP_NUM_THREADS: "8"  # Optimize for M3 Max performance cores

# Logging and diagnostics
logging:
  level: "info"
  metrics_interval_seconds: 60
  log_token_speed: true
  log_memory_usage: true

# Notes for deployment.
# Each entry is a literal block scalar (|), so line breaks and the relative
# indentation of the embedded commands are preserved as written.
notes:
  # NOTE(review): step 2 starts the server with --host 0.0.0.0, which listens
  # on every network interface; --host 127.0.0.1 is enough for the localhost
  # check in step 3.
  deployment: |
    1. Ensure llama.cpp fork with TurboQuant is built:
       cd /path/to/llama-cpp-turboquant
       git checkout feature/turboquant-kv-cache
       cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release
       cmake --build build -j$(sysctl -n hw.ncpu)

    2. Start the server:
       export TURBO_LAYER_ADAPTIVE=7
       ./build/bin/llama-server \
         -m /path/to/gemma-4-q4_k_m.gguf \
         --port 8081 \
         -ctk turbo4 -ctv turbo4 \
         -c 131072 \
         --host 0.0.0.0

    3. Verify server is running:
       curl http://localhost:8081/v1/models

    4. Copy this profile to Hermes:
       cp gemma4-turboquant.yaml ~/.hermes/profiles/

  performance_notes: |
    TurboQuant delivers:
    - 73% KV cache memory savings
    - 1% prompt processing overhead
    - 11% generation overhead
    - Enables 128K context on 36GB hardware

    With TurboQuant on Gemma 4 (estimated):
    - Model weights: ~16GB at Q4_K_M
    - KV cache at 128K: ~5GB (vs ~20GB without compression)
    - Total memory: ~23GB (fits within the 28GB max_gpu_memory_gb limit)

  troubleshooting: |
    - If generation speed is slow, try turbo3 instead of turbo4
    - If quality issues, disable per-layer adaptive (set mode to 0)
    - For maximum quality on sensitive layers, use asymmetric K/V:
      -ctk q8_0 -ctv turbo4
    - Monitor memory with: vmmap --summary $(pgrep llama-server)