timmy-config/wizards/bezalel/llama-server.sh
Timmy Time · e6c5129a94
feat: resurrect Bezalel with Gemma 4 + llama.cpp stack
- Add EPIC.md with resurrection plan
- Create Hermes profile with Bezalel persona
- Add llama-server.sh for Gemma 4 inference
- Update start_bezalel.sh with stack checks
- Add README with quick start guide

Backend: llama.cpp
Model: Gemma 4 26B MoE (Apache 2.0)
Frontend: Hermes profile

No OpenAI. No cloud. Pure sovereign stack.
2026-04-02 20:12:21 +00:00
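
The commit lists Hermes as the frontend; llama-server serves an OpenAI-compatible HTTP API, so any client that speaks that protocol can be pointed at it. As a minimal smoke test once the server is up (the prompt is a placeholder, and the port assumes the script's default of 8080):

    curl -s http://localhost:8080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"messages": [{"role": "user", "content": "Hello, Bezalel"}]}'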

#!/bin/bash
# llama.cpp Server for Bezalel — Gemma 4 26B MoE
# Start this before running Hermes
set -e
MODEL_PATH="${MODEL_PATH:-/opt/models/gemma-4-26b-moe-Q4_K_M.gguf}"
HOST="${LLAMA_HOST:-0.0.0.0}"
PORT="${LLAMA_PORT:-8080}"
CONTEXT="${LLAMA_CONTEXT:-8192}"
GPU_LAYERS="${LLAMA_GPU_LAYERS:-99}"
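# Each value above is an env-var override with a default, so a one-off run can
# reconfigure the server without editing this file, e.g.:
#   MODEL_PATH=/data/models/other.gguf LLAMA_PORT=8081 ./llama-server.sh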
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
echo -e "${GREEN}=== Bezalel llama.cpp Server ===${NC}"
echo "Model: Gemma 4 26B MoE"
echo "Path: $MODEL_PATH"
echo "Host: $HOST:$PORT"
echo "Context: $CONTEXT"
echo "GPU Layers: $GPU_LAYERS"
echo ""
# Check model exists
if [ ! -f "$MODEL_PATH" ]; then
    echo -e "${RED}ERROR: Model not found at $MODEL_PATH${NC}"
    echo "Download with:"
    echo "  huggingface-cli download google/gemma-4-26b-moe-GGUF --local-dir /opt/models/"
    exit 1
fi
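# huggingface-cli is provided by the huggingface_hub Python package:
#   pip install -U "huggingface_hub[cli]"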
# Check llama-server exists
if ! command -v llama-server &> /dev/null; then
    echo -e "${RED}ERROR: llama-server not found${NC}"
    echo "Build llama.cpp with:"
    echo "  cmake -B build -DGGML_CUDA=ON"
    echo "  cmake --build build --config Release -j$(nproc)"
    exit 1
fi
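# Note: the cmake build drops binaries in build/bin/; llama-server must be on
# PATH (e.g. symlinked into /usr/local/bin) for the check above to pass.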
echo -e "${YELLOW}Starting server...${NC}"
echo ""
exec llama-server \
    --model "$MODEL_PATH" \
    --host "$HOST" \
    --port "$PORT" \
    --ctx-size "$CONTEXT" \
    --n-gpu-layers "$GPU_LAYERS" \
    --threads 8 \
    --batch-size 512 \
    --timeout 300 \
    --metrics \
    --verbose
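
With the script running, the server can be sanity-checked before starting Hermes: llama-server answers on /health once the model is loaded, and the --metrics flag above additionally exposes Prometheus-style counters on /metrics (paths assume the default port):

    curl -s http://localhost:8080/health
    curl -s http://localhost:8080/metrics | head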