feat: add Allegro VPS preset configurations (#95)
profiles/allegro-cpu-presets.yaml (new file, 164 lines)
@@ -0,0 +1,164 @@
# Allegro VPS Presets — 2 cores, 8GB RAM, CPU-only inference
# Optimized for the Timmy Foundation Allegro server (167.99.126.228)
#
# Hardware constraints:
# - 2 CPU cores (no GPU)
# - 8GB RAM total
# - ~2GB reserved for OS + hermes agent
# - ~6GB available for model + KV cache
#
# Strategy: GGUF quantization via llama.cpp (CPU-optimized), plus
# KV cache compression via TurboQuant to maximize context within RAM.

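# Rough rule of thumb for sanity-checking the estimated_ram_gb figures
# below (a sketch; assumes a GQA transformer, exact numbers vary per model):
#   kv_bytes_per_token ≈ 2 (K+V) × n_layers × n_kv_heads × head_dim × bytes_per_elem
#   estimated_ram ≈ model_size + context_tokens × kv_bytes_per_token + runtime overhead
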
hardware:
  hostname: "allegro"
  ip: "167.99.126.228"
  cores: 2
  ram_gb: 8
  gpu: false
  os_reserved_gb: 2
  available_gb: 6
  arch: "x86_64"
  cpu_backend: "llama.cpp"

presets:
  # ─── TIER 1: Conservative (fits comfortably) ──────────────────────
  tiny:
    name: "tiny-2b-q4"
    description: "2B param model, Q4_K_M — leaves headroom for other processes"
    model_size_gb: 1.5
    quantization: "Q4_K_M"
    context_tokens: 4096
    kv_type: "f16"
    estimated_ram_gb: 2.8
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 4096
      batch: 256
    expected_perf:
      tokens_per_sec: "8-15"
      ttft_s: "1.5-3.0"
    use_case: "Simple Q&A, short completions, triage"
    ollama_model: "qwen2.5:1.5b"
    llama_cpp_model: "qwen2.5-1.5b-instruct-q4_k_m.gguf"

  small:
    name: "small-3b-q4"
    description: "3B param model, Q4_K_M — sweet spot for value on 2 cores"
    model_size_gb: 2.0
    quantization: "Q4_K_M"
    context_tokens: 8192
    kv_type: "turbo2"
    estimated_ram_gb: 3.6
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 8192
      batch: 512
      ctk: "q4_0"
      ctv: "q4_0"
    expected_perf:
      tokens_per_sec: "5-10"
      ttft_s: "2.0-5.0"
    use_case: "Code generation, tool calling, burn-loop workers"
    ollama_model: "qwen2.5:3b"
    llama_cpp_model: "qwen2.5-3b-instruct-q4_k_m.gguf"

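  # Note: ctk/ctv above map to llama-server's --cache-type-k/--cache-type-v
  # flags (KV cache quantization); q4_0 stores ~4.5 bits per element vs 16
  # for the f16 default, cutting KV cache RAM to roughly a quarter.
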
  # ─── TIER 2: Balanced (recommended default) ───────────────────────
  medium:
    name: "medium-7b-q4"
    description: "7B param model, Q4_K_M + TurboQuant — best quality that fits"
    model_size_gb: 4.1
    quantization: "Q4_K_M"
    context_tokens: 8192
    kv_type: "turbo4"
    estimated_ram_gb: 5.2
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 8192
      batch: 512
      ctk: "q4_0"
      ctv: "q4_0"
      layer_adaptive: 7
    expected_perf:
      tokens_per_sec: "2-5"
      ttft_s: "4.0-8.0"
    use_case: "Complex reasoning, multi-turn conversation, analysis"
    ollama_model: "qwen2.5:7b"
    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"

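  # Sanity check for the 5.2GB estimate above (a sketch, assuming
  # Qwen2.5-7B's published geometry: 28 layers, 4 KV heads, head_dim 128):
  #   f16 KV ≈ 2 × 28 × 4 × 128 × 2 bytes ≈ 56KB/token → ~0.45GB at 8K context
  #   q4_0 (~4.5 bits/elem) cuts that to ~0.13GB; the rest covers the 4.1GB
  #   weights plus compute buffers and overhead.
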
  medium_long:
    name: "medium-7b-q4-long"
    description: "7B Q4 + aggressive TurboQuant for 32K context"
    model_size_gb: 4.1
    quantization: "Q4_K_M"
    context_tokens: 32768
    kv_type: "turbo4"
    estimated_ram_gb: 5.8
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 32768
      batch: 256
      ctk: "q3_k"
      ctv: "q3_k"
      layer_adaptive: 7
    expected_perf:
      tokens_per_sec: "1.5-4"
      ttft_s: "6.0-12.0"
    use_case: "Long document analysis, code review, research"
    ollama_model: "qwen2.5:7b"
    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"

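  # At 32K context the same assumed geometry needs ~1.8GB of f16 KV;
  # q3_k (~3.4 bits/elem) brings that under ~0.4GB, which is why this
  # preset drops the KV cache to q3_k despite the quality cost.
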
  # ─── TIER 3: Pushing limits (may swap) ────────────────────────────
  large:
    name: "large-14b-q3"
    description: "14B param model, Q3_K_M — may page to swap, use with caution"
    model_size_gb: 6.5
    quantization: "Q3_K_M"
    context_tokens: 4096
    kv_type: "turbo4"
    estimated_ram_gb: 7.2
    fits_in_allegro: false
    warning: "Exceeds 6GB limit. Needs swap or will OOM. Use only for batch jobs."
    server_flags:
      threads: 2
      context: 4096
      batch: 256
      ctk: "q3_k"
      ctv: "q3_k"
      layer_adaptive: 7
    expected_perf:
      tokens_per_sec: "0.5-2"
      ttft_s: "10.0-30.0"
    use_case: "Batch processing, overnight jobs (with swap)"
    ollama_model: "qwen2.5:14b"
    llama_cpp_model: "qwen2.5-14b-instruct-q3_k_m.gguf"

# Recommended default for Allegro
recommended_preset: "medium"

# Server startup examples
examples:
  ollama: |
    # Pull and run
    ollama pull qwen2.5:7b
    ollama run qwen2.5:7b

  llama_cpp: |
    # With TurboQuant KV cache
    export TURBO_LAYER_ADAPTIVE=7
    llama-server \
      -m /models/qwen2.5-7b-instruct-q4_k_m.gguf \
      --port 8081 \
      -t 2 \
      -c 8192 \
      -b 512 \
      -ctk q4_0 -ctv q4_0 \
      --host 0.0.0.0

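  # Optional smoke test once llama-server is up. Assumes a recent llama.cpp
  # build (its /health and OpenAI-compatible /v1 endpoints); the key name
  # here is illustrative, nothing reads it.
  verify_llama_cpp: |
    curl -s http://127.0.0.1:8081/health
    curl -s http://127.0.0.1:8081/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"messages":[{"role":"user","content":"Reply with one word."}],"max_tokens":8}'
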
  hermes_profile: |
    # Use with hermes agent
    hermes -p allegro-medium chat
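
  # The "large" preset warns it needs swap. A plain-Linux way to provision
  # 4GB (sizing is a guess for this host; adjust to taste):
  swap_setup: |
    sudo fallocate -l 4G /swapfile
    sudo chmod 600 /swapfile
    sudo mkswap /swapfile
    sudo swapon /swapfile
    # persist across reboots
    echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab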