From 70d292c222c46c8c8b3e8e126d08fc0e8c873ba8 Mon Sep 17 00:00:00 2001
From: Alexander Whitestone
Date: Thu, 16 Apr 2026 01:50:50 +0000
Subject: [PATCH] feat: add Allegro VPS preset configurations (#95)

---
 profiles/allegro-cpu-presets.yaml | 164 ++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 profiles/allegro-cpu-presets.yaml

diff --git a/profiles/allegro-cpu-presets.yaml b/profiles/allegro-cpu-presets.yaml
new file mode 100644
index 00000000..b049f53a
--- /dev/null
+++ b/profiles/allegro-cpu-presets.yaml
@@ -0,0 +1,164 @@
+# Allegro VPS Presets — 2 cores, 8GB RAM, CPU-only inference
+# Optimized for the Timmy Foundation Allegro server (167.99.126.228)
+#
+# Hardware constraints:
+#   - 2 CPU cores (no GPU)
+#   - 8GB RAM total
+#   - ~2GB reserved for OS + hermes agent
+#   - ~6GB available for model + KV cache
+#
+# Strategy: GGUF quantization via llama.cpp (CPU-optimized), plus
+# KV cache compression via TurboQuant to maximize context within RAM
+
+hardware:
+  hostname: "allegro"
+  ip: "167.99.126.228"
+  cores: 2
+  ram_gb: 8
+  gpu: false
+  os_reserved_gb: 2
+  available_gb: 6
+  arch: "x86_64"
+  cpu_backend: "llama.cpp"
+
+presets:
+  # ─── TIER 1: Conservative (fits comfortably) ──────────────────────
+  tiny:
+    name: "tiny-2b-q4"
+    description: "2B-class model (Qwen2.5 1.5B), Q4_K_M — leaves headroom for other processes"
+    model_size_gb: 1.5
+    quantization: "Q4_K_M"
+    context_tokens: 4096
+    kv_type: "f16"
+    estimated_ram_gb: 2.8
+    fits_in_allegro: true
+    server_flags:
+      threads: 2
+      context: 4096
+      batch: 256
+    expected_perf:
+      tokens_per_sec: "8-15"
+      ttft_s: "1.5-3.0"
+    use_case: "Simple Q&A, short completions, triage"
+    ollama_model: "qwen2.5:1.5b"
+    llama_cpp_model: "qwen2.5-1.5b-instruct-q4_k_m.gguf"
+
+  small:
+    name: "small-3b-q4"
+    description: "3B param model, Q4_K_M — the speed/quality sweet spot on 2 cores"
+    model_size_gb: 2.0
+    quantization: "Q4_K_M"
+    context_tokens: 8192
+    kv_type: "turbo2"
+    estimated_ram_gb: 3.6
+    fits_in_allegro: true
+    server_flags:
+      threads: 2
+      context: 8192
+      batch: 512
+      ctk: "q4_0"
+      ctv: "q4_0"
+    expected_perf:
+      tokens_per_sec: "5-10"
+      ttft_s: "2.0-5.0"
+    use_case: "Code generation, tool calling, burn-loop workers"
+    ollama_model: "qwen2.5:3b"
+    llama_cpp_model: "qwen2.5-3b-instruct-q4_k_m.gguf"
+
+  # ─── TIER 2: Balanced (recommended default) ───────────────────────
+  medium:
+    name: "medium-7b-q4"
+    description: "7B param model, Q4_K_M + TurboQuant — best quality that fits"
+    model_size_gb: 4.1
+    quantization: "Q4_K_M"
+    context_tokens: 8192
+    kv_type: "turbo4"
+    estimated_ram_gb: 5.2
+    fits_in_allegro: true
+    server_flags:
+      threads: 2
+      context: 8192
+      batch: 512
+      ctk: "q4_0"
+      ctv: "q4_0"
+      layer_adaptive: 7
+    expected_perf:
+      tokens_per_sec: "2-5"
+      ttft_s: "4.0-8.0"
+    use_case: "Complex reasoning, multi-turn conversation, analysis"
+    ollama_model: "qwen2.5:7b"
+    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"
+
+  medium_long:
+    name: "medium-7b-q4-long"
+    description: "7B Q4 + aggressive TurboQuant for 32K context"
+    model_size_gb: 4.1
+    quantization: "Q4_K_M"
+    context_tokens: 32768
+    kv_type: "turbo4"
+    estimated_ram_gb: 5.8
+    fits_in_allegro: true
+    server_flags:
+      threads: 2
+      context: 32768
+      batch: 256
+      ctk: "q3_k"
+      ctv: "q3_k"
+      layer_adaptive: 7
+    expected_perf:
+      tokens_per_sec: "1.5-4"
+      ttft_s: "6.0-12.0"
+    use_case: "Long document analysis, code review, research"
+    ollama_model: "qwen2.5:7b"
+    llama_cpp_model: "qwen2.5-7b-instruct-q4_k_m.gguf"
+
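+  # ──────────────────────────────────────────────────────────────────
+  # Sanity check on the estimated_ram_gb figures above. A rough sketch:
+  # the Qwen2.5-7B shape used here (28 layers, 4 KV heads via GQA,
+  # head_dim 128) is an assumption from the public model card, not
+  # measured on this box.
+  #
+  #   kv_bytes/token ≈ 2 (K+V) × n_layers × n_kv_heads × head_dim × bytes/elem
+  #   f16 : 2 × 28 × 4 × 128 × 2 B ≈ 56 KiB/token → ~450 MB at 8K context
+  #   q4_0: ~4.5 bits/elem, roughly 0.28× of f16  → ~130 MB at 8K context
+  #
+  # So "medium" ≈ 4.1 GB weights + ~0.13 GB KV + ~1 GB compute/runtime
+  # buffers ≈ 5.2 GB, comfortably under the 6 GB budget.
+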
+  # ─── TIER 3: Pushing limits (may swap) ────────────────────────────
+  large:
+    name: "large-14b-q3"
+    description: "14B param model, Q3_K_M — may page to swap, use with caution"
+    model_size_gb: 6.5
+    quantization: "Q3_K_M"
+    context_tokens: 4096
+    kv_type: "turbo4"
+    estimated_ram_gb: 7.2
+    fits_in_allegro: false
+    warning: "Exceeds 6GB limit. Needs swap (see examples.swap_setup below) or will OOM. Use only for batch jobs."
+    server_flags:
+      threads: 2
+      context: 4096
+      batch: 256
+      ctk: "q3_k"
+      ctv: "q3_k"
+      layer_adaptive: 7
+    expected_perf:
+      tokens_per_sec: "0.5-2"
+      ttft_s: "10.0-30.0"
+    use_case: "Batch processing, overnight jobs (with swap)"
+    ollama_model: "qwen2.5:14b"
+    llama_cpp_model: "qwen2.5-14b-instruct-q3_k_m.gguf"
+
+# Recommended default for Allegro
+recommended_preset: "medium"
+
+# Server startup examples
+examples:
+  ollama: |
+    # Pull and run
+    ollama pull qwen2.5:7b
+    ollama run qwen2.5:7b
+
+  llama_cpp: |
+    # With TurboQuant KV cache
+    export TURBO_LAYER_ADAPTIVE=7
+    llama-server \
+      -m /models/qwen2.5-7b-instruct-q4_k_m.gguf \
+      --port 8081 \
+      -t 2 \
+      -c 8192 \
+      -b 512 \
+      -ctk q4_0 -ctv q4_0 \
+      --host 0.0.0.0
+
+  hermes_profile: |
+    # Use with hermes agent
+    hermes -p allegro-medium chat
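+
+  swap_setup: |
+    # For the "large" preset only: it exceeds the 6GB budget and needs swap.
+    # A minimal sketch, assuming a Debian/Ubuntu host with root access;
+    # the 4G size is an illustrative choice, not a measured requirement.
+    sudo fallocate -l 4G /swapfile
+    sudo chmod 600 /swapfile
+    sudo mkswap /swapfile
+    sudo swapon /swapfile
+    # Expect heavy paging; keep this preset to batch/overnight jobs.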