From 70d292c222c46c8c8b3e8e126d08fc0e8c873ba8 Mon Sep 17 00:00:00 2001
From: Alexander Whitestone
Date: Thu, 16 Apr 2026 01:50:50 +0000
Subject: [PATCH] feat: add Allegro VPS preset configurations (#95)

---
 profiles/allegro-cpu-presets.yaml | 164 ++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 profiles/allegro-cpu-presets.yaml

diff --git a/profiles/allegro-cpu-presets.yaml b/profiles/allegro-cpu-presets.yaml
new file mode 100644
index 00000000..b049f53a
--- /dev/null
+++ b/profiles/allegro-cpu-presets.yaml
@@ -0,0 +1,164 @@
+# Allegro VPS Presets — 2 cores, 8GB RAM, CPU-only inference
+# Optimized for the Timmy Foundation Allegro server (167.99.126.228)
+#
+# Hardware constraints:
+#   - 2 CPU cores (no GPU)
+#   - 8GB RAM total
+#   - ~2GB reserved for OS + hermes agent
+#   - ~6GB available for model + KV cache
+#
+# Strategy: GGUF quantization via llama.cpp (CPU-optimized), plus
+# KV cache compression via TurboQuant to maximize context within RAM
+
+hardware:
+  hostname: "allegro"
+  ip: "167.99.126.228"
+  cores: 2
+  ram_gb: 8
+  gpu: false
+  os_reserved_gb: 2
+  available_gb: 6
+  arch: "x86_64"
+  cpu_backend: "llama.cpp"
+
+presets:
+  # ─── TIER 1: Conservative (fits comfortably) ──────────────────────
+  tiny:
+    name: "tiny-2b-q4"
+    description: "2B-class model (Qwen2.5 1.5B), Q4_K_M — leaves headroom for other processes"
+    model_size_gb: 1.5
+    quantization: "Q4_K_M"
+    context_tokens: 4096
+    kv_type: "f16"
+    estimated_ram_gb: 2.8
+    fits_in_allegro: true
+    server_flags:
+      threads: 2
+      context: 4096
+      batch: 256
+    expected_perf:
+      tokens_per_sec: "8-15"
+      ttft_s: "1.5-3.0"
+    use_case: "Simple Q&A, short completions, triage"
+    ollama_model: "qwen2.5:1.5b"
+    llama_cpp_model: "qwen2.5-1.5b-instruct-q4_k_m.gguf"
+
+  small:
+    name: "small-3b-q4"
+    description: "3B param model, Q4_K_M — the speed/quality sweet spot on 2 cores"
+    model_size_gb: 2.0
+    quantization: "Q4_K_M"
+    context_tokens: 8192
+    kv_type: "turbo2"
+    estimated_ram_gb: 3.6
+    fits_in_allegro: true
+    server_flags:
+      threads: 2
+      context: 8192
+      batch: 512
+      ctk: "q4_0"
+      ctv: "q4_0"
+    expected_perf:
+      tokens_per_sec: "5-10"
+      ttft_s: "2.0-5.0"
+    use_case: "Code generation, tool calling, burn-loop workers"
+    ollama_model: "qwen2.5:3b"
+    llama_cpp_model: "qwen2.5-3b-instruct-q4_k_m.gguf"
+
+  # ─── TIER 2: Balanced (recommended default) ───────────────────────
+  medium:
+    name: "medium-7b-q4"
+    description: "7B param model, Q4_K_M + TurboQuant — best quality that fits"
+    model_size_gb: 4.1
+    quantization: "Q4_K_M"
+    context_tokens: 8192
+    kv_type: "turbo4"
+    estimated_ram_gb: 5.2
+    fits_in_allegro: true
+    server_flags:
+      threads: 2
+      context: 8192
+      batch: 512
+      ctk: "q4_0"
+      ctv: "q4_0"
+      layer_adaptive: 7
+    expected_perf:
+      tokens_per_sec: "2-5"
+      ttft_s: "4.0-8.0"
+    use_case: "Complex reasoning, multi-turn conversation, analysis"
+    ollama_model: "qwen2.5:7b"
+    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"
+
+  medium_long:
+    name: "medium-7b-q4-long"
+    description: "7B Q4 + aggressive TurboQuant for 32K context"
+    model_size_gb: 4.1
+    quantization: "Q4_K_M"
+    context_tokens: 32768
+    kv_type: "turbo4"
+    estimated_ram_gb: 5.8
+    fits_in_allegro: true
+    server_flags:
+      threads: 2
+      context: 32768
+      batch: 256
+      ctk: "q3_k"
+      ctv: "q3_k"
+      layer_adaptive: 7
+    expected_perf:
+      tokens_per_sec: "1.5-4"
+      ttft_s: "6.0-12.0"
+    use_case: "Long document analysis, code review, research"
+    ollama_model: "qwen2.5:7b"
+    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"
+
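+  # ──────────────────────────────────────────────────────────────────
+  # Sanity check on the estimated_ram_gb figures above. A rough sketch:
+  # the Qwen2.5-7B shape used here (28 layers, 4 KV heads via GQA,
+  # head_dim 128) is an assumption from the public model card, not
+  # measured on this box.
+  #
+  #   kv_bytes/token ≈ 2 (K+V) × n_layers × n_kv_heads × head_dim × bytes/elem
+  #   f16 : 2 × 28 × 4 × 128 × 2 B ≈ 56 KiB/token → ~450 MB at 8K context
+  #   q4_0: ~4.5 bits/elem, roughly 0.28× of f16  → ~130 MB at 8K context
+  #
+  # So "medium" ≈ 4.1 GB weights + ~0.13 GB KV + ~1 GB compute/runtime
+  # buffers ≈ 5.2 GB, comfortably under the 6 GB budget.
+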
+  # ─── TIER 3: Pushing limits (may swap) ────────────────────────────
+  large:
+    name: "large-14b-q3"
+    description: "14B param model, Q3_K_M — may page to swap, use with caution"
+    model_size_gb: 6.5
+    quantization: "Q3_K_M"
+    context_tokens: 4096
+    kv_type: "turbo4"
+    estimated_ram_gb: 7.2
+    fits_in_allegro: false
+    warning: "Exceeds 6GB limit. Needs swap (see examples.swap_setup below) or will OOM. Use only for batch jobs."
+    server_flags:
+      threads: 2
+      context: 4096
+      batch: 256
+      ctk: "q3_k"
+      ctv: "q3_k"
+      layer_adaptive: 7
+    expected_perf:
+      tokens_per_sec: "0.5-2"
+      ttft_s: "10.0-30.0"
+    use_case: "Batch processing, overnight jobs (with swap)"
+    ollama_model: "qwen2.5:14b"
+    llama_cpp_model: "qwen2.5-14b-instruct-q3_k_m.gguf"
+
+# Recommended default for Allegro
+recommended_preset: "medium"
+
+# Server startup examples
+examples:
+  ollama: |
+    # Pull and run
+    ollama pull qwen2.5:7b
+    ollama run qwen2.5:7b
+
+  llama_cpp: |
+    # With TurboQuant KV cache
+    export TURBO_LAYER_ADAPTIVE=7
+    llama-server \
+      -m /models/qwen2.5-7b-instruct-q4_k_m.gguf \
+      --port 8081 \
+      -t 2 \
+      -c 8192 \
+      -b 512 \
+      -ctk q4_0 -ctv q4_0 \
+      --host 0.0.0.0
+
+  hermes_profile: |
+    # Use with hermes agent
+    hermes -p allegro-medium chat
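+
+  swap_setup: |
+    # For the "large" preset only: it exceeds the 6GB budget and needs swap.
+    # A minimal sketch, assuming a Debian/Ubuntu host with root access;
+    # the 4G size is an illustrative choice, not a measured requirement.
+    sudo fallocate -l 4G /swapfile
+    sudo chmod 600 /swapfile
+    sudo mkswap /swapfile
+    sudo swapon /swapfile
+    # Expect heavy paging; keep this preset to batch/overnight jobs.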