feat: add Allegro VPS preset configurations (#95)
profiles/allegro-cpu-presets.yaml (new file, 164 lines)
@@ -0,0 +1,164 @@
# Allegro VPS Presets — 2 cores, 8GB RAM, CPU-only inference
# Optimized for the Timmy Foundation Allegro server (167.99.126.228)
#
# Hardware constraints:
# - 2 CPU cores (no GPU)
# - 8GB RAM total
# - ~2GB reserved for OS + hermes agent
# - ~6GB available for model + KV cache
#
# Strategy: GGUF quantization via llama.cpp (CPU-optimized), plus
# KV cache compression via TurboQuant to maximize context within RAM.

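# Rough rule of thumb for sanity-checking the estimated_ram_gb figures
# below (a sketch; assumes a GQA transformer, exact numbers vary per model):
#   kv_bytes_per_token ≈ 2 (K+V) × n_layers × n_kv_heads × head_dim × bytes_per_elem
#   estimated_ram ≈ model_size + context_tokens × kv_bytes_per_token + runtime overhead
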
hardware:
  hostname: "allegro"
  ip: "167.99.126.228"
  cores: 2
  ram_gb: 8
  gpu: false
  os_reserved_gb: 2
  available_gb: 6
  arch: "x86_64"
  cpu_backend: "llama.cpp"

presets:
  # ─── TIER 1: Conservative (fits comfortably) ──────────────────────
  tiny:
    name: "tiny-2b-q4"
    description: "2B param model, Q4_K_M — leaves headroom for other processes"
    model_size_gb: 1.5
    quantization: "Q4_K_M"
    context_tokens: 4096
    kv_type: "f16"
    estimated_ram_gb: 2.8
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 4096
      batch: 256
    expected_perf:
      tokens_per_sec: "8-15"
      ttft_s: "1.5-3.0"
    use_case: "Simple Q&A, short completions, triage"
    ollama_model: "qwen2.5:1.5b"
    llama_cpp_model: "qwen2.5-1.5b-instruct-q4_k_m.gguf"

  small:
    name: "small-3b-q4"
    description: "3B param model, Q4_K_M — sweet spot for value on 2 cores"
    model_size_gb: 2.0
    quantization: "Q4_K_M"
    context_tokens: 8192
    kv_type: "turbo2"
    estimated_ram_gb: 3.6
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 8192
      batch: 512
      ctk: "q4_0"
      ctv: "q4_0"
    expected_perf:
      tokens_per_sec: "5-10"
      ttft_s: "2.0-5.0"
    use_case: "Code generation, tool calling, burn-loop workers"
    ollama_model: "qwen2.5:3b"
    llama_cpp_model: "qwen2.5-3b-instruct-q4_k_m.gguf"

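  # Note: ctk/ctv above map to llama-server's --cache-type-k/--cache-type-v
  # flags (KV cache quantization); q4_0 stores ~4.5 bits per element vs 16
  # for the f16 default, cutting KV cache RAM to roughly a quarter.
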
  # ─── TIER 2: Balanced (recommended default) ───────────────────────
  medium:
    name: "medium-7b-q4"
    description: "7B param model, Q4_K_M + TurboQuant — best quality that fits"
    model_size_gb: 4.1
    quantization: "Q4_K_M"
    context_tokens: 8192
    kv_type: "turbo4"
    estimated_ram_gb: 5.2
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 8192
      batch: 512
      ctk: "q4_0"
      ctv: "q4_0"
      layer_adaptive: 7
    expected_perf:
      tokens_per_sec: "2-5"
      ttft_s: "4.0-8.0"
    use_case: "Complex reasoning, multi-turn conversation, analysis"
    ollama_model: "qwen2.5:7b"
    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"

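  # Sanity check for the 5.2GB estimate above (a sketch, assuming
  # Qwen2.5-7B's published geometry: 28 layers, 4 KV heads, head_dim 128):
  #   f16 KV ≈ 2 × 28 × 4 × 128 × 2 bytes ≈ 56KB/token → ~0.45GB at 8K context
  #   q4_0 (~4.5 bits/elem) cuts that to ~0.13GB; the rest covers the 4.1GB
  #   weights plus compute buffers and overhead.
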
  medium_long:
    name: "medium-7b-q4-long"
    description: "7B Q4 + aggressive TurboQuant for 32K context"
    model_size_gb: 4.1
    quantization: "Q4_K_M"
    context_tokens: 32768
    kv_type: "turbo4"
    estimated_ram_gb: 5.8
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 32768
      batch: 256
      ctk: "q3_k"
      ctv: "q3_k"
      layer_adaptive: 7
    expected_perf:
      tokens_per_sec: "1.5-4"
      ttft_s: "6.0-12.0"
    use_case: "Long document analysis, code review, research"
    ollama_model: "qwen2.5:7b"
    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"

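  # At 32K context the same assumed geometry needs ~1.8GB of f16 KV;
  # q3_k (~3.4 bits/elem) brings that under ~0.4GB, which is why this
  # preset drops the KV cache to q3_k despite the quality cost.
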
  # ─── TIER 3: Pushing limits (may swap) ────────────────────────────
  large:
    name: "large-14b-q3"
    description: "14B param model, Q3_K_M — may page to swap, use with caution"
    model_size_gb: 6.5
    quantization: "Q3_K_M"
    context_tokens: 4096
    kv_type: "turbo4"
    estimated_ram_gb: 7.2
    fits_in_allegro: false
    warning: "Exceeds 6GB limit. Needs swap or will OOM. Use only for batch jobs."
    server_flags:
      threads: 2
      context: 4096
      batch: 256
      ctk: "q3_k"
      ctv: "q3_k"
      layer_adaptive: 7
    expected_perf:
      tokens_per_sec: "0.5-2"
      ttft_s: "10.0-30.0"
    use_case: "Batch processing, overnight jobs (with swap)"
    ollama_model: "qwen2.5:14b"
    llama_cpp_model: "qwen2.5-14b-instruct-q3_k_m.gguf"

# Recommended default for Allegro
recommended_preset: "medium"

# Server startup examples
examples:
  ollama: |
    # Pull and run
    ollama pull qwen2.5:7b
    ollama run qwen2.5:7b

  llama_cpp: |
    # With TurboQuant KV cache
    export TURBO_LAYER_ADAPTIVE=7
    llama-server \
      -m /models/qwen2.5-7b-instruct-q4_k_m.gguf \
      --port 8081 \
      -t 2 \
      -c 8192 \
      -b 512 \
      -ctk q4_0 -ctv q4_0 \
      --host 0.0.0.0

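  # Optional smoke test once llama-server is up. Assumes a recent llama.cpp
  # build (its /health and OpenAI-compatible /v1 endpoints); the key name
  # here is illustrative, nothing reads it.
  verify_llama_cpp: |
    curl -s http://127.0.0.1:8081/health
    curl -s http://127.0.0.1:8081/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"messages":[{"role":"user","content":"Reply with one word."}],"max_tokens":8}'
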
  hermes_profile: |
    # Use with hermes agent
    hermes -p allegro-medium chat
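
  # The "large" preset warns it needs swap. A plain-Linux way to provision
  # 4GB (sizing is a guess for this host; adjust to taste):
  swap_setup: |
    sudo fallocate -l 4G /swapfile
    sudo chmod 600 /swapfile
    sudo mkswap /swapfile
    sudo swapon /swapfile
    # persist across reboots
    echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab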