# Hermes Profile: Gemma 4 + TurboQuant KV Cache Compression
# For use with a local llama.cpp server running TurboQuant-enabled inference
# Drop into ~/.hermes/profiles/gemma4-turboquant.yaml

profile:
  name: "gemma4-turboquant"
  version: "1.0.0"
  description: "Gemma 4 model with TurboQuant KV cache compression for extended context on Apple Silicon"

# Primary provider: local llama.cpp server with TurboQuant
providers:
  primary:
    type: "llama.cpp"
    name: "local-turboquant"
    endpoint: "http://localhost:8081"
    api_path: "/v1/chat/completions"
    timeout_ms: 120000

    # Model configuration
    model:
      name: "gemma-4"
      path: "/path/to/gemma-4-q4_k_m.gguf"   # Update with actual model path

    # TurboQuant KV cache compression settings
    turboquant:
      enabled: true
      kv_type: "turbo4"        # Options: turbo2, turbo3, turbo4 (4-bit recommended)
      layer_adaptive_mode: 7   # Per-layer adaptive quantization (0-7, 7 = best quality/ratio)

    # Context and memory settings
    context:
      max_tokens: 131072   # 128K context with TurboQuant compression
      batch_size: 512

    # Generation parameters
    generation:
      temperature: 0.7
      top_p: 0.9
      top_k: 40
      repeat_penalty: 1.1
      frequency_penalty: 0.0
      presence_penalty: 0.0

    # Server startup command (for reference)
    server_command: |
      export TURBO_LAYER_ADAPTIVE=7
      llama-server \
        -m /path/to/gemma-4-q4_k_m.gguf \
        --port 8081 \
        -ctk turbo4 -ctv turbo4 \
        -c 131072 \
        --host 0.0.0.0

  # Fallback provider 1: Ollama (standard, no TurboQuant)
  fallback_1:
    type: "ollama"
    name: "ollama-gemma4"
    endpoint: "http://localhost:11434"
    api_path: "/api/chat"
    timeout_ms: 120000
    model:
      name: "gemma4:latest"
    generation:
      temperature: 0.7
      top_p: 0.9
      top_k: 40

  # Fallback provider 2: OpenAI-compatible API (cloud backup)
  fallback_2:
    type: "openai"
    name: "openai-backup"
    endpoint: "https://api.openai.com"
    api_path: "/v1/chat/completions"
    timeout_ms: 60000
    model:
      name: "gpt-4"
    generation:
      temperature: 0.7
      max_tokens: 4096

# Performance and monitoring
performance:
  # Memory management for TurboQuant
  memory:
    max_gpu_memory_gb: 28            # Leave headroom on 36GB M3 Max
    kv_cache_compression: "turbo4"
    estimated_savings: "73%"         # TurboQuant delivers ~73% KV memory savings

  # Benchmarking integration
  benchmarks:
    enabled: true
    metrics:
      - "tokens_per_second"
      - "time_to_first_token"
      - "peak_memory_usage"
      - "perplexity"

# Quality validation
quality:
  # Test prompts for quality comparison
  test_prompts:
    enabled: true
    prompt_file: "benchmarks/prompts.json"

  # Perplexity testing
  perplexity:
    enabled: true
    corpus: "wikitext-2-raw"
    context_lengths: [8192, 32768, 65536, 131072]

# Environment variables (applied when using this profile)
environment:
  TURBO_LAYER_ADAPTIVE: "7"   # Per-layer adaptive quantization mode
  GGML_METAL_DEBUG: "0"       # Disable Metal debug in production
  OMP_NUM_THREADS: "8"        # Optimize for M3 Max performance cores

# Logging and diagnostics
logging:
  level: "info"
  metrics_interval_seconds: 60
  log_token_speed: true
  log_memory_usage: true

# Notes for deployment
notes:
  deployment: |
    1. Ensure the llama.cpp fork with TurboQuant is built:
         cd /path/to/llama-cpp-turboquant
         git checkout feature/turboquant-kv-cache
         cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release
         cmake --build build -j$(sysctl -n hw.ncpu)
    2. Start the server:
         export TURBO_LAYER_ADAPTIVE=7
         ./build/bin/llama-server \
           -m /path/to/gemma-4-q4_k_m.gguf \
           --port 8081 \
           -ctk turbo4 -ctv turbo4 \
           -c 131072 \
           --host 0.0.0.0
    3. Verify the server is running:
         curl http://localhost:8081/v1/models
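       Optionally, send a short test request through the chat endpoint to
       confirm end-to-end inference (a minimal sketch; llama-server's
       OpenAI-compatible endpoint serves whatever model it was launched
       with, so no "model" field is required):
         curl http://localhost:8081/v1/chat/completions \
           -H "Content-Type: application/json" \
           -d '{"messages":[{"role":"user","content":"Reply with OK."}],"max_tokens":8}'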
    4. Copy this profile to Hermes:
         cp hermes-profile-gemma4-turboquant.yaml ~/.hermes/profiles/

  performance_notes: |
    TurboQuant delivers:
      - 73% KV cache memory savings
      - 1% prompt processing overhead
      - 11% generation overhead
      - Enables 128K context on 36GB hardware

    With TurboQuant on Gemma 4 (estimated):
      - Model weights: ~16GB at Q4_K_M
      - KV cache at 128K: ~5GB (vs. ~20GB without compression)
      - Total memory: ~23GB (fits comfortably under the 28GB budget set in
        performance.memory.max_gpu_memory_gb above)

  troubleshooting: |
    - If generation is slow, try turbo3 instead of turbo4.
    - If output quality degrades, disable per-layer adaptive quantization
      (set layer_adaptive_mode to 0).
    - For maximum quality, keep the more quantization-sensitive keys at
      higher precision with asymmetric K/V cache types: -ctk q8_0 -ctv turbo4
    - Monitor memory usage with: vmmap --summary $(pgrep llama-server)
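  # A back-of-envelope sanity check on the figures in performance_notes,
  # derived only from the numbers quoted there, not from independent
  # measurements (the key name below is illustrative, not read by Hermes):
  sizing_check: |
    uncompressed KV cache at 128K:  ~20GB
    with ~73% savings:              20GB x (1 - 0.73) = ~5.4GB  (matches the ~5GB estimate)
    weights + compressed KV:        ~16GB + ~5.4GB = ~21.4GB
    plus compute/scratch buffers:   ~23GB total, under the 28GB max_gpu_memory_gb budget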