#!/usr/bin/env bash
# Launch llama-server with a Gemma 4 GGUF model and the turbo4 KV-cache types.
#
# On first run, downloads the quantized model from Hugging Face into
# $MODEL_DIR; subsequent runs reuse the cached file. The server listens on
# http://$HOST:$PORT/v1 (OpenAI-compatible endpoint).
#
# Requirements:
#   - `hf` CLI (huggingface_hub) on PATH, for the one-time model download
#   - a built llama-server binary at $SERVER_BIN (turboquant llama.cpp fork)
set -euo pipefail

readonly MODEL_DIR="$HOME/models/gemma4-llamacpp"
readonly MODEL_FILE="$MODEL_DIR/gemma-4-e4b-it-Q4_K_M.gguf"
readonly REPO_ID="ggml-org/gemma-4-E4B-it-GGUF"
readonly FILE_NAME="gemma-4-e4b-it-Q4_K_M.gguf"
readonly SERVER_BIN="$HOME/turboquant/llama-cpp-fork/build/bin/llama-server"
readonly PORT="8081"
readonly HOST="127.0.0.1"

# Print an error to stderr and abort.
die() { printf '%s\n' "$*" >&2; exit 1; }

mkdir -p "$MODEL_DIR"

# Preflight checks: fail early with a clear message rather than mid-download
# or mid-launch.
command -v hf >/dev/null 2>&1 \
  || die "hf CLI not found. Install huggingface_hub first."
[[ -x "$SERVER_BIN" ]] || die "llama-server not found at: $SERVER_BIN"

if [[ ! -f "$MODEL_FILE" ]]; then
  echo "[Gemma4-llama.cpp] Downloading real Gemma 4 GGUF from $REPO_ID"
  echo "[Gemma4-llama.cpp] Target: $MODEL_FILE"
  hf download "$REPO_ID" "$FILE_NAME" --local-dir "$MODEL_DIR"
  # A zero exit from `hf download` does not guarantee the file landed at the
  # exact expected path; verify before telling the user it is ready.
  [[ -f "$MODEL_FILE" ]] || die "Download finished but missing: $MODEL_FILE"
fi

echo
printf '[Gemma4-llama.cpp] Model: %s\n' "$MODEL_FILE"
printf '[Gemma4-llama.cpp] Server: %s\n' "$SERVER_BIN"
printf '[Gemma4-llama.cpp] Listen: http://%s:%s/v1\n' "$HOST" "$PORT"
echo '[Gemma4-llama.cpp] Starting llama-server with turbo4 KV cache...'
echo

# KV-cache tuning knob read by the turboquant llama.cpp fork.
# NOTE(review): semantics of the value 7 are defined by the fork — confirm there.
export TURBO_LAYER_ADAPTIVE=7

# exec replaces this shell, so signals (Ctrl-C, SIGTERM) go straight to
# llama-server and no orphan wrapper process is left behind.
exec "$SERVER_BIN" \
  --jinja \
  -m "$MODEL_FILE" \
  --host "$HOST" \
  --port "$PORT" \
  -ngl 99 \
  -c 8192 \
  -ctk turbo4 \
  -ctv turbo4