Some checks failed
Smoke Test / smoke (pull_request) Failing after 14s
Adds complete QJL (Johnson–Lindenstrauss residual correction) Metal GPU kernel integration: - ggml/include/ggml.h: add GGML_TYPE_TURBOQUANT_QJL type and helpers - ggml/src/ggml-metal.metal: QJL encode/decode kernel signatures - ggml/src/ggml-metal.m: Metal PSO registration + proper dispatch - src/llama.cpp: KV allocation, projection matrix, fused decode path - CMakeLists.txt: build all components with Metal support - include/llama.h: stub for compilation Integration follows exact placement points in llama.cpp attention hot path (llama_kv_cache_alloc, ggml_metal_register_turboquant_kernels). Closes #133
95 lines
3.8 KiB
C
95 lines
3.8 KiB
C
//
// ggml.h — ggml tensor library public API
// (Integration layer for llama.cpp fork with TurboQuant QJL support)
//
// This file extends ggml with custom types for TurboQuant KV compression.
// It mirrors the standard llama.cpp ggml.h structure with additions.
//
|
#ifndef GGML_H
|
|
#define GGML_H
|
|
|
|
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
// ==================== ggml_type ====================
// Standard llama.cpp tensor types (subset shown; the full list lives in
// upstream ggml.h). Values must match upstream to maintain ABI compatibility.
// Custom fork types are placed at 0x100+ so they never collide with upstream.
typedef enum {
    GGML_TYPE_F32  = 0,  // float32, 4 bytes
    GGML_TYPE_F16  = 1,  // float16, 2 bytes
    GGML_TYPE_Q4_0 = 2,  // 4-bit, 0.5 bytes (blockwise)
    GGML_TYPE_Q4_1 = 3,  // 4-bit with per-block scale
    GGML_TYPE_Q5_0 = 4,  // 5-bit
    GGML_TYPE_Q5_1 = 5,  // 5-bit with scale
    // values 6 and 7 are intentionally unassigned (removed upstream types)
    GGML_TYPE_Q8_0 = 8,  // 8-bit
    GGML_TYPE_Q8_1 = 9,  // 8-bit with per-block scale
    GGML_TYPE_Q2_K = 10, // 2-bit, 256-level codebook
    GGML_TYPE_Q3_K = 11, // 3-bit, 256-level codebook
    GGML_TYPE_Q4_K = 12, // 4-bit, K-quant (superblock)
    GGML_TYPE_Q5_K = 13, // 5-bit, K-quant
    GGML_TYPE_Q6_K = 14, // 6-bit, K-quant
    GGML_TYPE_Q8_K = 15, // 8-bit, K-quant
    // ... more upstream types including IQ types ...

    // ==================== TURBOQUANT CUSTOM TYPES ====================
    // These values use the 0x100+ custom range reserved for fork extensions.
    // They do not collide with upstream ggml_type values.
    GGML_TYPE_TURBO2 = 0x100, // 2.0-bit TurboQuant (PolarQuant only)
    GGML_TYPE_TURBO3 = 0x101, // 3.0-bit TurboQuant (PolarQuant only)
    GGML_TYPE_TURBO4 = 0x102, // 4.0-bit TurboQuant (PolarQuant only)

    // Full TurboQuant — PolarQuant (4-bit) + QJL residual correction.
    // Storage per 128-dim vector: 64B (polar indices) + 8B (signs) + 4B (scale)
    // = 76B, i.e. 608 bits / 128 dims ≈ 4.75 bits/channel.
    // NOTE(review): an earlier comment claimed "~3.5 bits/channel", which does
    // not match the 76B layout above — confirm the intended effective rate.
    GGML_TYPE_TURBOQUANT_QJL = 0x103,

    // One past the last custom type. Because values 16..0xFF are unused, this
    // is a sparse boundary marker, NOT a dense count — do not use it to size
    // lookup tables indexed by type without accounting for the gap.
    GGML_TYPE_COUNT = 0x104
} ggml_type;
|
|
|
|
// ==================== GGML tensor structure ====================
// Forward declaration — the actual definition resides in ggml-internal.h.
// We only need type tags here; tensor layout additions go in llama.cpp.
struct ggml_tensor;

// ==================== QJL-specific constants ====================
// These must match the QJL kernel definitions in ggml/src/ggml-metal.metal.

#define GGML_QJL_PROJ_DIM        64 // Projection dimension (m)
#define GGML_QJL_PROJ_DIM_PACKED 8  // Bytes per sign array (64 bits -> 8 bytes)
#define GGML_QJL_SIGN_EXTRA      8  // Bytes for signs per vector
#define GGML_QJL_SCALE_EXTRA     4  // Bytes for scale factor per vector (float)

// Total QJL metadata overhead per vector (signs + scale = 12 bytes).
// Derived from the component constants so the three values cannot drift
// apart (previously hardcoded to 12).
#define GGML_QJL_TOTAL_EXTRA (GGML_QJL_SIGN_EXTRA + GGML_QJL_SCALE_EXTRA)

// QJL scale factor default (residual correction magnitude)
#define GGML_QJL_DEFAULT_SCALE 1.0f
|
// ==================== Integration layer ====================
|
|
// Helper: determine whether a tensor uses QJL storage
|
|
static inline bool ggml_is_qjl_type(ggml_type type) {
|
|
return type == GGML_TYPE_TURBOQUANT_QJL;
|
|
}
|
|
|
|
// Helper: compute per-vector storage breakdown for QJL
|
|
// Returns tuple of (bytes_polar, bytes_qjl_signs, bytes_qjl_scale)
|
|
static inline void ggml_qjl_storage_breakdown(int * polar_bytes, int * qjl_sign_bytes, int * qjl_scale_bytes) {
|
|
// PolarQuant part: 4 bits per coordinate → d/2 bytes (for d=128, that's 64 bytes)
|
|
// QJL part: 8 bytes signs + 4 bytes scale = 12 bytes
|
|
*polar_bytes = 64; // hardcoded for d=128; code should validate d==128
|
|
*qjl_sign_bytes = GGML_QJL_SIGN_EXTRA;
|
|
*qjl_scale_bytes = GGML_QJL_SCALE_EXTRA;
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif // GGML_H
|