- Add EPIC.md with resurrection plan
- Create Hermes profile with Bezalel persona
- Add llama-server.sh for Gemma 4 inference
- Update start_bezalel.sh with stack checks
- Add README with quick start guide

Backend: llama.cpp
Model: Gemma 4 26B MoE (Apache 2.0)
Frontend: Hermes profile

No OpenAI. No cloud. Pure sovereign stack.
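Once the server below is up, any OpenAI-compatible client (Hermes included) talks to it over plain HTTP. A minimal smoke test against the script's default host/port, sketched here for reference:

    # Liveness probe: llama-server serves a /health endpoint
    curl -s http://localhost:8080/health

    # One-shot chat completion via the OpenAI-compatible API
    curl -s http://localhost:8080/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{"messages":[{"role":"user","content":"Hello, Bezalel"}]}'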
llama-server.sh · 58 lines · 1.4 KiB · Bash · Executable File
#!/bin/bash
# llama.cpp Server for Bezalel — Gemma 4 26B MoE
# Start this before running Hermes

set -e

MODEL_PATH="${MODEL_PATH:-/opt/models/gemma-4-26b-moe-Q4_K_M.gguf}"
HOST="${LLAMA_HOST:-0.0.0.0}"
PORT="${LLAMA_PORT:-8080}"
CONTEXT="${LLAMA_CONTEXT:-8192}"
GPU_LAYERS="${LLAMA_GPU_LAYERS:-99}"
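# A high --n-gpu-layers value like 99 tells llama.cpp to offload every layer
# that fits onto the GPU; set LLAMA_GPU_LAYERS=0 to force a CPU-only run.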

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

echo -e "${GREEN}=== Bezalel llama.cpp Server ===${NC}"
echo "Model: Gemma 4 26B MoE"
echo "Path: $MODEL_PATH"
echo "Host: $HOST:$PORT"
echo "Context: $CONTEXT"
echo "GPU Layers: $GPU_LAYERS"
echo ""

# Check model exists
if [ ! -f "$MODEL_PATH" ]; then
    echo -e "${RED}ERROR: Model not found at $MODEL_PATH${NC}"
    echo "Download with:"
    echo "  huggingface-cli download google/gemma-4-26b-moe-GGUF --local-dir /opt/models/"
    exit 1
fi
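# (huggingface-cli ships with the huggingface_hub package:
#  pip install -U "huggingface_hub[cli]")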

# Check llama-server exists
if ! command -v llama-server &> /dev/null; then
    echo -e "${RED}ERROR: llama-server not found${NC}"
    echo "Build llama.cpp with:"
    echo "  cmake -B build -DGGML_CUDA=ON"
    echo "  cmake --build build --config Release -j$(nproc)"
    exit 1
fi
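# (building with -DGGML_CUDA=ON requires the CUDA toolkit; drop the flag
#  for a CPU-only build)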

echo -e "${YELLOW}Starting server...${NC}"
echo ""

exec llama-server \
    --model "$MODEL_PATH" \
    --host "$HOST" \
    --port "$PORT" \
    --ctx-size "$CONTEXT" \
    --n-gpu-layers "$GPU_LAYERS" \
    --threads 8 \
    --batch-size 512 \
    --timeout 300 \
    --metrics \
    --verbose
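Every tunable in the script is read from the environment, so a one-off run can override the defaults without editing the file (the model path below is illustrative):

    # CPU-only run against a model in a non-default location
    MODEL_PATH=/data/models/gemma-4-26b-moe-Q4_K_M.gguf \
    LLAMA_PORT=8081 LLAMA_CONTEXT=16384 LLAMA_GPU_LAYERS=0 \
    ./llama-server.sh

With --metrics enabled, the running server also exposes Prometheus-compatible counters at /metrics on the same port.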