// llama.h — Stub header for reference integration build
//
// Minimal stand-ins for the llama.cpp types and constants that the
// TurboQuant QJL integration code compiles against. This is NOT a real
// API; every definition here exists only so the integration sources
// build in isolation.

#ifndef LLAMA_H
#define LLAMA_H

// Use the C standard headers rather than <cstddef>/<cstdint>: those are
// C++-only, and this header must compile from both C and C++ TUs.
#include <stddef.h>
#include <stdint.h>

// Opaque stand-in for the real llama context. The single char member
// keeps the struct non-empty — an empty struct is ill-formed in ISO C
// (it is a C++/GNU extension) — while sizeof stays 1 under C++ too.
struct llama_context {
    char opaque;
};

struct ggml_tensor; // forward declaration only; never dereferenced here

// Stub KV-cache record mirroring the fields the integration touches.
// Field meanings are inferred from names — confirm against the real
// llama.cpp definition before relying on them.
typedef struct llama_kv_cache {
    int n;               // presumably the number of cached entries — TODO confirm
    int d;               // presumably the per-entry dimension — TODO confirm
    void * data;         // raw cache storage (ownership not specified by this stub)
    int type;            // a GGML_TYPE_* value; int instead of enum to avoid ABI issues
    float * qjl_scales;  // QJL quantization scale factors (per-block, presumably)
    uint8_t * qjl_signs; // QJL packed sign bits
    float * qjl_proj;    // QJL projection matrix
} llama_kv_cache;

// Minimal ggml_type values needed for integration.
#define GGML_TYPE_F32 0
#define GGML_TYPE_F16 1
#define GGML_TYPE_Q4_0 2
#define GGML_TYPE_TURBOQUANT_QJL 0x103

#endif // LLAMA_H