# turboquant/profiles/hermes-profile-gemma4-turboquant.yaml

# Hermes Profile: Gemma 4 + TurboQuant KV Cache Compression
# For use with local llama.cpp server running TurboQuant-enabled inference
# Drop into ~/.hermes/profiles/gemma4-turboquant.yaml

# Identity metadata for this profile.
profile:
  name: 'gemma4-turboquant'
  # Kept quoted so the version is always read as a string.
  version: '1.0.0'
  description: >-
    Gemma 4 model with TurboQuant KV cache compression for extended context
    on Apple Silicon

# Primary provider: local llama.cpp server with TurboQuant
providers:
  primary:
    type: "llama.cpp"
    name: "local-turboquant"
    # The client connects over loopback; the port must match `--port` in
    # server_command below.
    endpoint: "http://localhost:8081"
    api_path: "/v1/chat/completions"
    timeout_ms: 120000

    # Model configuration
    model:
      name: "gemma-4"
      path: "/path/to/gemma-4-q4_k_m.gguf"  # Update with actual model path

    # TurboQuant KV cache compression settings
    turboquant:
      enabled: true
      kv_type: "turbo4"  # Options: turbo2, turbo3, turbo4 (4-bit recommended)
      layer_adaptive_mode: 7  # Per-layer adaptive quantization (0-7, 7=best quality/ratio)

    # Context and memory settings
    context:
      max_tokens: 131072  # 128K context with TurboQuant compression
      batch_size: 512

    # Generation parameters
    generation:
      temperature: 0.7
      top_p: 0.9
      top_k: 40
      repeat_penalty: 1.1
      frequency_penalty: 0.0
      presence_penalty: 0.0

    # Server startup command (for reference).
    # Keep it in sync with the settings above: `-ctk`/`-ctv` mirror
    # turboquant.kv_type, `-c` mirrors context.max_tokens, and
    # TURBO_LAYER_ADAPTIVE mirrors turboquant.layer_adaptive_mode.
    # The server binds to loopback only, because `endpoint` is localhost and
    # the command sets no API key. Use a wider `--host` only when remote
    # access is intended, and then pair it with `--api-key`.
    server_command: |
      export TURBO_LAYER_ADAPTIVE=7
      llama-server \
        -m /path/to/gemma-4-q4_k_m.gguf \
        --port 8081 \
        -ctk turbo4 -ctv turbo4 \
        -c 131072 \
        --host 127.0.0.1

  # Fallback provider 1: Ollama (standard, no TurboQuant)
  fallback_1:
    type: 'ollama'
    name: 'ollama-gemma4'
    endpoint: 'http://localhost:11434'
    api_path: '/api/chat'
    timeout_ms: 120000

    # Model tag requested from the Ollama endpoint.
    model:
      name: 'gemma4:latest'

    # Sampling parameters for this provider.
    generation:
      temperature: 0.7
      top_p: 0.9
      top_k: 40

  # Fallback provider 2: OpenAI-compatible API (cloud backup)
  fallback_2:
    type: 'openai'
    name: 'openai-backup'
    endpoint: 'https://api.openai.com'
    api_path: '/v1/chat/completions'
    timeout_ms: 60000

    # Model requested from the cloud endpoint.
    model:
      name: 'gpt-4'

    # Sampling temperature and a token cap for this provider.
    generation:
      temperature: 0.7
      max_tokens: 4096

# Performance and monitoring
performance:
  # Memory management for TurboQuant
  memory:
    # Leave headroom on 36GB M3 Max
    max_gpu_memory_gb: 28
    kv_cache_compression: 'turbo4'
    # TurboQuant delivers ~73% KV memory savings
    estimated_savings: '73%'

  # Benchmarking integration
  benchmarks:
    enabled: true
    # Metric names to collect.
    metrics:
      - 'tokens_per_second'
      - 'time_to_first_token'
      - 'peak_memory_usage'
      - 'perplexity'

# Quality validation
quality:
  # Test prompts for quality comparison
  test_prompts:
    enabled: true
    prompt_file: 'benchmarks/prompts.json'

  # Perplexity testing
  perplexity:
    enabled: true
    corpus: 'wikitext-2-raw'
    # Context sizes to test, in tokens, from 8K up to 128K.
    context_lengths:
      - 8192
      - 32768
      - 65536
      - 131072

# Environment variables (applied when using this profile).
# Every value is a quoted string so it is not retyped as an integer.
environment:
  # Per-layer adaptive quantization mode
  TURBO_LAYER_ADAPTIVE: '7'
  # Disable Metal debug in production
  GGML_METAL_DEBUG: '0'
  # Optimize for M3 Max performance cores
  OMP_NUM_THREADS: '8'

# Logging and diagnostics
logging:
  level: 'info'
  # Metrics reporting interval, in seconds (per the key name).
  metrics_interval_seconds: 60
  # Toggles for logging token speed and memory usage.
  log_token_speed: true
  log_memory_usage: true

# Notes for deployment
notes:
  # Step-by-step setup instructions, stored as a literal block so the shell
  # line breaks and indentation are preserved.
  # Step 2 binds the server to loopback (127.0.0.1), because the profile
  # connects via localhost and the command sets no API key.
  # Step 4 names the destination file explicitly so the installed profile
  # ends up at ~/.hermes/profiles/gemma4-turboquant.yaml.
  deployment: |
    1. Ensure llama.cpp fork with TurboQuant is built:
       cd /path/to/llama-cpp-turboquant
       git checkout feature/turboquant-kv-cache
       cmake -B build -DGGML_METAL=ON -DCMAKE_BUILD_TYPE=Release
       cmake --build build -j$(sysctl -n hw.ncpu)

    2. Start the server:
       export TURBO_LAYER_ADAPTIVE=7
       ./build/bin/llama-server \
         -m /path/to/gemma-4-q4_k_m.gguf \
         --port 8081 \
         -ctk turbo4 -ctv turbo4 \
         -c 131072 \
         --host 127.0.0.1

    3. Verify server is running:
       curl http://localhost:8081/v1/models

    4. Copy this profile to Hermes:
       cp hermes-profile-gemma4-turboquant.yaml ~/.hermes/profiles/gemma4-turboquant.yaml

  # Headline figures and an estimated memory breakdown.
  # The budget quoted here matches performance.memory.max_gpu_memory_gb (28).
  # NOTE(review): weights (~16GB) + KV cache (~5GB) sum to ~21GB; the ~23GB
  # total presumably includes runtime overhead — confirm.
  performance_notes: |
    TurboQuant delivers:
    - 73% KV cache memory savings
    - 1% prompt processing overhead
    - 11% generation overhead
    - Enables 128K context on 36GB hardware

    With TurboQuant on Gemma 4 (estimated):
    - Model weights: ~16GB at Q4_K_M
    - KV cache at 128K: ~5GB (vs ~20GB without compression)
    - Total memory: ~23GB (fits comfortably in 28GB budget)

  # Quick remedies for speed, quality and memory problems.
  troubleshooting: |
    - If generation speed is slow, try turbo3 instead of turbo4
    - If quality issues, disable per-layer adaptive (set mode to 0)
    - For maximum quality on sensitive layers, use asymmetric K/V:
      -ctk q8_0 -ctv turbo4
    - Monitor memory with: vmmap --summary $(pgrep llama-server)