- Add EPIC.md with resurrection plan
- Create Hermes profile with Bezalel persona
- Add llama-server.sh for Gemma 4 inference
- Update start_bezalel.sh with stack checks
- Add README with quick start guide

Backend: llama.cpp
Model: Gemma 4 26B MoE (Apache 2.0)
Frontend: Hermes profile

No OpenAI. No cloud. Pure sovereign stack.
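Once the server below is up, any OpenAI-compatible client (Hermes included) talks to it over plain HTTP. A minimal smoke test against the script's default host/port, sketched here for reference:

    # Liveness probe: llama-server serves a /health endpoint
    curl -s http://localhost:8080/health

    # One-shot chat completion via the OpenAI-compatible API
    curl -s http://localhost:8080/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{"messages":[{"role":"user","content":"Hello, Bezalel"}]}'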
llama-server.sh · 58 lines · 1.4 KiB · Bash · Executable File
#!/bin/bash
# llama.cpp Server for Bezalel — Gemma 4 26B MoE
# Start this before running Hermes

set -e

MODEL_PATH="${MODEL_PATH:-/opt/models/gemma-4-26b-moe-Q4_K_M.gguf}"
HOST="${LLAMA_HOST:-0.0.0.0}"
PORT="${LLAMA_PORT:-8080}"
CONTEXT="${LLAMA_CONTEXT:-8192}"
GPU_LAYERS="${LLAMA_GPU_LAYERS:-99}"
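# A high --n-gpu-layers value like 99 tells llama.cpp to offload every layer
# that fits onto the GPU; set LLAMA_GPU_LAYERS=0 to force a CPU-only run.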

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

echo -e "${GREEN}=== Bezalel llama.cpp Server ===${NC}"
echo "Model: Gemma 4 26B MoE"
echo "Path: $MODEL_PATH"
echo "Host: $HOST:$PORT"
echo "Context: $CONTEXT"
echo "GPU Layers: $GPU_LAYERS"
echo ""

# Check model exists
if [ ! -f "$MODEL_PATH" ]; then
    echo -e "${RED}ERROR: Model not found at $MODEL_PATH${NC}"
    echo "Download with:"
    echo "  huggingface-cli download google/gemma-4-26b-moe-GGUF --local-dir /opt/models/"
    exit 1
fi
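# (huggingface-cli ships with the huggingface_hub package:
#  pip install -U "huggingface_hub[cli]")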

# Check llama-server exists
if ! command -v llama-server &> /dev/null; then
    echo -e "${RED}ERROR: llama-server not found${NC}"
    echo "Build llama.cpp with:"
    echo "  cmake -B build -DGGML_CUDA=ON"
    echo "  cmake --build build --config Release -j$(nproc)"
    exit 1
fi
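# (building with -DGGML_CUDA=ON requires the CUDA toolkit; drop the flag
#  for a CPU-only build)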

echo -e "${YELLOW}Starting server...${NC}"
echo ""

exec llama-server \
    --model "$MODEL_PATH" \
    --host "$HOST" \
    --port "$PORT" \
    --ctx-size "$CONTEXT" \
    --n-gpu-layers "$GPU_LAYERS" \
    --threads 8 \
    --batch-size 512 \
    --timeout 300 \
    --metrics \
    --verbose
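Every tunable in the script is read from the environment, so a one-off run can override the defaults without editing the file (the model path below is illustrative):

    # CPU-only run against a model in a non-default location
    MODEL_PATH=/data/models/gemma-4-26b-moe-Q4_K_M.gguf \
    LLAMA_PORT=8081 LLAMA_CONTEXT=16384 LLAMA_GPU_LAYERS=0 \
    ./llama-server.sh

With --metrics enabled, the running server also exposes Prometheus-compatible counters at /metrics on the same port.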