feat: add Allegro VPS preset configurations (#95)

2026-04-16 01:50:50 +00:00
parent 3cd8750cbb
commit 70d292c222


@@ -0,0 +1,164 @@
# Allegro VPS Presets — 2 cores, 8GB RAM, CPU-only inference
# Optimized for the Timmy Foundation Allegro server (167.99.126.228)
#
# Hardware constraints:
# - 2 CPU cores (no GPU)
# - 8GB RAM total
# - ~2GB reserved for OS + hermes agent
# - ~6GB available for model + KV cache
#
# Strategy: GGUF quantization via llama.cpp (CPU-optimized)
# KV cache compression via TurboQuant to maximize context within RAM
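#
# Sizing arithmetic behind each preset's estimated_ram_gb (a rough sketch,
# not a measurement):
#   kv_bytes ≈ 2 (K and V) * n_layers * n_kv_heads * head_dim * bytes_per_elt * n_ctx
#   estimated_ram ≈ model_size + kv_cache + ~0.5-1GB runtime overhead
# Example for the medium preset, assuming Qwen2.5-7B's published GQA shape
# (28 layers, 4 KV heads, head_dim 128): f16 KV at 8192 ctx is
# 2*28*4*128*2B*8192 ≈ 0.45GB; a q4_0 KV cache cuts that to roughly a quarter.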
hardware:
  hostname: "allegro"
  ip: "167.99.126.228"
  cores: 2
  ram_gb: 8
  gpu: false
  os_reserved_gb: 2
  available_gb: 6
  arch: "x86_64"
  cpu_backend: "llama.cpp"
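# Quick sanity check that a host matches these specs (standard Linux tools;
# expected values follow from the numbers above):
#   nproc    # expect 2
#   free -h  # expect ~8GB total, ~6GB usable for the model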
presets:
  # ─── TIER 1: Conservative (fits comfortably) ──────────────────────
  tiny:
    name: "tiny-2b-q4"
    description: "2B-class model, Q4_K_M — leaves headroom for other processes"
    model_size_gb: 1.5
    quantization: "Q4_K_M"
    context_tokens: 4096
    kv_type: "f16"
    estimated_ram_gb: 2.8
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 4096
      batch: 256
    expected_perf:
      tokens_per_sec: "8-15"
      ttft_s: "1.5-3.0"
    use_case: "Simple Q&A, short completions, triage"
    ollama_model: "qwen2.5:1.5b"
    llama_cpp_model: "qwen2.5-1.5b-instruct-q4_k_m.gguf"
  small:
    name: "small-3b-q4"
    description: "3B param model, Q4_K_M — quality/speed sweet spot on 2 cores"
    model_size_gb: 2.0
    quantization: "Q4_K_M"
    context_tokens: 8192
    kv_type: "turbo2"
    estimated_ram_gb: 3.6
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 8192
      batch: 512
      ctk: "q4_0"
      ctv: "q4_0"
    expected_perf:
      tokens_per_sec: "5-10"
      ttft_s: "2.0-5.0"
    use_case: "Code generation, tool calling, burn-loop workers"
    ollama_model: "qwen2.5:3b"
    llama_cpp_model: "qwen2.5-3b-instruct-q4_k_m.gguf"
  # ─── TIER 2: Balanced (recommended default) ───────────────────────
  medium:
    name: "medium-7b-q4"
    description: "7B param model, Q4_K_M + TurboQuant — best quality that fits"
    model_size_gb: 4.1
    quantization: "Q4_K_M"
    context_tokens: 8192
    kv_type: "turbo4"
    estimated_ram_gb: 5.2
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 8192
      batch: 512
      ctk: "q4_0"
      ctv: "q4_0"
      layer_adaptive: 7
    expected_perf:
      tokens_per_sec: "2-5"
      ttft_s: "4.0-8.0"
    use_case: "Complex reasoning, multi-turn conversation, analysis"
    ollama_model: "qwen2.5:7b"
    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"
  medium_long:
    name: "medium-7b-q4-long"
    description: "7B Q4 + aggressive TurboQuant KV for 32K context (see the llama_cpp_long example below)"
    model_size_gb: 4.1
    quantization: "Q4_K_M"
    context_tokens: 32768
    kv_type: "turbo4"
    estimated_ram_gb: 5.8
    fits_in_allegro: true
    server_flags:
      threads: 2
      context: 32768
      batch: 256
      ctk: "q3_k"
      ctv: "q3_k"
      layer_adaptive: 7
    expected_perf:
      tokens_per_sec: "1.5-4"
      ttft_s: "6.0-12.0"
    use_case: "Long document analysis, code review, research"
    ollama_model: "qwen2.5:7b"
    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"
  # ─── TIER 3: Pushing limits (may swap) ────────────────────────────
  large:
    name: "large-14b-q3"
    description: "14B param model, Q3_K_M — may page to swap; use with caution"
    model_size_gb: 6.5
    quantization: "Q3_K_M"
    context_tokens: 4096
    kv_type: "turbo4"
    estimated_ram_gb: 7.2
    fits_in_allegro: false
    warning: "Exceeds the 6GB budget; needs swap or will OOM (see the swap_setup example below). Use only for batch jobs."
    server_flags:
      threads: 2
      context: 4096
      batch: 256
      ctk: "q3_k"
      ctv: "q3_k"
      layer_adaptive: 7
    expected_perf:
      tokens_per_sec: "0.5-2"
      ttft_s: "10.0-30.0"
    use_case: "Batch processing, overnight jobs (with swap)"
    ollama_model: "qwen2.5:14b"
    llama_cpp_model: "qwen2.5-14b-instruct-q3_k_m.gguf"
# Recommended default for Allegro
recommended_preset: "medium"

# Server startup examples
examples:
  ollama: |
    # Pull and run
    ollama pull qwen2.5:7b
    ollama run qwen2.5:7b
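  # Ollama's default context window is shorter than these presets assume; one
  # way to pin it is a Modelfile (standard Ollama Modelfile syntax; the
  # "qwen2.5-7b-8k" tag below is illustrative):
  ollama_ctx: |
    # Modelfile
    FROM qwen2.5:7b
    PARAMETER num_ctx 8192
    # then:
    #   ollama create qwen2.5-7b-8k -f Modelfile
    #   ollama run qwen2.5-7b-8k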
  llama_cpp: |
    # With TurboQuant KV cache
    export TURBO_LAYER_ADAPTIVE=7
    llama-server \
      -m /models/qwen2.5-7b-instruct-q4_k_m.gguf \
      --port 8081 \
      -t 2 \
      -c 8192 \
      -b 512 \
      -ctk q4_0 -ctv q4_0 \
      --host 0.0.0.0
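  # Same server tuned for the medium_long preset (32K context, q3_k KV).
  # A sketch that mirrors its server_flags above; assumes the
  # TurboQuant-patched build accepts q3_k cache types, as those flags imply:
  llama_cpp_long: |
    export TURBO_LAYER_ADAPTIVE=7
    llama-server \
      -m /models/qwen2.5-7b-instruct-q4_k_m.gguf \
      --port 8081 \
      -t 2 \
      -c 32768 \
      -b 256 \
      -ctk q3_k -ctv q3_k \
      --host 0.0.0.0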
  hermes_profile: |
    # Use with hermes agent
    hermes -p allegro-medium chat
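  # Only needed for the "large" preset, which exceeds the 6GB budget (see its
  # warning above). Standard Linux swapfile setup; the 4G size is an
  # assumption, not a tested recommendation:
  swap_setup: |
    sudo fallocate -l 4G /swapfile
    sudo chmod 600 /swapfile
    sudo mkswap /swapfile
    sudo swapon /swapfile
    # persist across reboots:
    echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab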