Compare commits

...

10 Commits

Author SHA1 Message Date
02d1edafdc feat: WASM README (#104)
All checks were successful
Smoke Test / smoke (pull_request) Successful in 13s
2026-04-16 01:27:13 +00:00
998452d9b3 feat: WASM build script (#104) 2026-04-16 01:27:12 +00:00
77521e7e71 feat: WASM demo page (#104) 2026-04-16 01:27:11 +00:00
54c3f23bd8 feat: WASM Web Worker (#104) 2026-04-16 01:27:10 +00:00
d576879a57 feat: WASM inference C module (#104) 2026-04-16 01:25:28 +00:00
3cd8750cbb Merge pull request 'feat: standalone build system and roundtrip tests - #17' (#51) from dispatch/17-1776180746 into main
All checks were successful
Smoke Test / smoke (pull_request) Successful in 15s
2026-04-15 11:57:58 +00:00
ef765bbd30 Merge pull request 'fix(docs): resolve broken markdown links and stale forge URL' (#52) from burn/fix-doc-links into main 2026-04-15 11:57:55 +00:00
Hermes Agent
5f0d00f127 fix(docs): resolve broken markdown links and stale forge URL
All checks were successful
Smoke Test / smoke (pull_request) Successful in 6s
- Update raw-IP forge URL to canonical forge domain in README.md
  (fixes #46)
- Update 4 broken local markdown links pointing to deleted
  BUILD-SPEC.md, PHASE1-REPORT.md, FULL-REPORT.md to
  docs/PROJECT_STATUS.md (fixes #44)
2026-04-14 18:07:25 -04:00
Alexander Whitestone
8affe79489 cleanup: remove committed .pyc and redundant Python test, add .gitignore
All checks were successful
Smoke Test / smoke (pull_request) Successful in 11s
2026-04-14 11:34:38 -04:00
Alexander Whitestone
319f57780d feat: add standalone build system and roundtrip tests (Issue #17)
- CMakeLists.txt: builds turboquant as static library
- TURBOQUANT_BUILD_TESTS option enables ctest roundtrip tests
- tests/roundtrip_test.cpp: validates zero-vector roundtrip and
  gaussian cosine similarity (>=0.99)
- Makefile wrapper for convenience (build/test/clean targets)
- Addresses contributor feedback on spec-to-code gap and CI from #17
2026-04-14 11:34:38 -04:00
10 changed files with 608 additions and 5 deletions

3
.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
build/
*.pyc
__pycache__/

36
CMakeLists.txt Normal file
View File

@@ -0,0 +1,36 @@
# Standalone build for the TurboQuant library plus optional ctest-driven
# roundtrip validation tests (see tests/roundtrip_test.cpp).
cmake_minimum_required(VERSION 3.16)
project(turboquant LANGUAGES CXX)
# ON by default so a plain `cmake && ctest` works out of the box; consumers
# that only want the library can pass -DTURBOQUANT_BUILD_TESTS=OFF.
option(TURBOQUANT_BUILD_TESTS "Build standalone TurboQuant validation tests" ON)
add_library(turboquant STATIC
llama-turbo.cpp
)
# PUBLIC include dir: dependents compile against the headers in this directory.
target_include_directories(turboquant PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}
)
target_compile_features(turboquant PUBLIC cxx_std_17)
# Strict warnings on the library itself only (PRIVATE: not forced on dependents).
if(MSVC)
target_compile_options(turboquant PRIVATE /W4)
else()
target_compile_options(turboquant PRIVATE -Wall -Wextra -Wpedantic)
endif()
if(TURBOQUANT_BUILD_TESTS)
# include(CTest) implies enable_testing(), so `ctest` discovers the test below.
include(CTest)
add_executable(turboquant_roundtrip_test
tests/roundtrip_test.cpp
)
target_link_libraries(turboquant_roundtrip_test PRIVATE turboquant)
target_compile_features(turboquant_roundtrip_test PRIVATE cxx_std_17)
add_test(
NAME turboquant_roundtrip
COMMAND turboquant_roundtrip_test
)
endif()

View File

@@ -13,7 +13,7 @@ Unlock 64K-128K context on qwen3.5:27b within 32GB unified memory.
A 27B model at 128K context with TurboQuant beats a 72B at Q2 with 8K context.
## Status
See [issues](http://143.198.27.163:3000/Timmy_Foundation/turboquant/issues) for current progress.
See [issues](https://forge.alexanderwhitestone.com/Timmy_Foundation/turboquant/issues) for current progress.
## Roles
- **Strago:** Build spec author
@@ -29,4 +29,4 @@ See [issues](http://143.198.27.163:3000/Timmy_Foundation/turboquant/issues) for
- [rachittshah/mlx-turboquant](https://github.com/rachittshah/mlx-turboquant) — MLX fallback
## Docs
- [BUILD-SPEC.md](BUILD-SPEC.md) — Full build specification (Strago, v2.2)
- [Project Status](docs/PROJECT_STATUS.md) — Full project status and build specification

View File

@@ -135,7 +135,5 @@ llama-server -m model.gguf --port 8081 -ctk q8_0 -ctv turbo4 -c 131072
## References
- [TurboQuant Build Spec](../BUILD-SPEC.md)
- [Phase 1 Report](../PHASE1-REPORT.md)
- [Full Knowledge Transfer](../FULL-REPORT.md)
- [Project Status](../docs/PROJECT_STATUS.md)
- [llama.cpp TurboQuant Fork](https://github.com/TheTom/llama-cpp-turboquant)

104
tests/roundtrip_test.cpp Normal file
View File

@@ -0,0 +1,104 @@
#include "llama-turbo.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>
namespace {
constexpr int kDim = 128;
constexpr float kCosineThreshold = 0.99f;
constexpr float kZeroTolerance = 1.0e-6f;
// True when every element of `values` is a finite float (no NaN/inf).
// Vacuously true for an empty vector.
[[nodiscard]] bool all_finite(const std::vector<float> & values) {
    return std::all_of(values.begin(), values.end(),
                       [](float v) { return std::isfinite(v); });
}
// Largest absolute value in `values`; 0 for an empty vector.
[[nodiscard]] float max_abs(const std::vector<float> & values) {
    float result = 0.0f;
    for (std::size_t i = 0; i < values.size(); ++i) {
        const float magnitude = std::fabs(values[i]);
        if (magnitude > result) {
            result = magnitude;
        }
    }
    return result;
}
[[nodiscard]] float cosine_similarity(const std::vector<float> & lhs, const std::vector<float> & rhs) {
float dot = 0.0f;
float lhs_norm = 0.0f;
float rhs_norm = 0.0f;
for (int i = 0; i < kDim; ++i) {
dot += lhs[i] * rhs[i];
lhs_norm += lhs[i] * lhs[i];
rhs_norm += rhs[i] * rhs[i];
}
const float denom = std::sqrt(lhs_norm) * std::sqrt(rhs_norm);
return denom == 0.0f ? 1.0f : dot / denom;
}
// Encode `input` with TurboQuant 4-bit polar quantization and decode it
// straight back, returning the reconstructed vector. The norm produced by
// the encoder is written to `norm_out` so callers can inspect it.
[[nodiscard]] std::vector<float> roundtrip(const std::vector<float> & input, float & norm_out) {
    norm_out = -1.0f;
    std::vector<uint8_t> encoded(kDim / 2, 0);  // packed buffer: kDim 4-bit codes
    polar_quant_encode_turbo4(input.data(), encoded.data(), &norm_out, kDim);
    std::vector<float> reconstructed(kDim, 0.0f);
    polar_quant_decode_turbo4(encoded.data(), reconstructed.data(), norm_out, kDim);
    return reconstructed;
}
// Fail the current test with `message` unless `condition` holds.
void require(bool condition, const std::string & message) {
    if (condition) {
        return;
    }
    throw std::runtime_error(message);
}
// An all-zero input must encode with a zero norm and decode back to
// (numerically) zero without producing NaN or infinity.
void test_zero_vector_roundtrip() {
    const std::vector<float> input(kDim, 0.0f);
    float norm = -1.0f;
    const std::vector<float> decoded = roundtrip(input, norm);
    require(norm == 0.0f, "zero vector should encode with zero norm");
    require(all_finite(decoded), "zero vector decode produced non-finite values");
    require(max_abs(decoded) <= kZeroTolerance, "zero vector decode should remain near zero");
}
// A fixed-seed gaussian vector must survive encode/decode with cosine
// similarity at or above kCosineThreshold.
void test_gaussian_roundtrip_quality() {
    std::mt19937 rng(12345);  // fixed seed keeps this test deterministic
    std::normal_distribution<float> dist(0.0f, 1.0f);
    std::vector<float> input(kDim, 0.0f);
    for (int i = 0; i < kDim; ++i) {
        input[i] = dist(rng);
    }
    float norm = -1.0f;
    const std::vector<float> decoded = roundtrip(input, norm);
    require(norm > 0.0f, "random vector should encode with positive norm");
    require(all_finite(decoded), "random vector decode produced non-finite values");
    require(cosine_similarity(input, decoded) >= kCosineThreshold,
            "roundtrip cosine similarity below threshold");
}
} // namespace
// Entry point: run both roundtrip tests. Prints PASS and returns 0 on
// success; prints the failure message to stderr and returns 1 otherwise.
int main() {
    try {
        test_zero_vector_roundtrip();
        test_gaussian_roundtrip_quality();
    } catch (const std::exception & exc) {
        std::cerr << "FAIL: " << exc.what() << '\n';
        return 1;
    }
    std::cout << "PASS: turboquant standalone roundtrip tests\n";
    return 0;
}

78
wasm/README.md Normal file
View File

@@ -0,0 +1,78 @@
# WASM Inference Module
Run quantized models directly in the browser via WebAssembly.
## Why
- Crisis detection works offline
- No server round-trip
- Privacy: messages never leave the browser
## Architecture
```
Browser Page
└─ Web Worker (inference-worker.js)
└─ WASM Module (llama-turbo-wasm.wasm)
└─ llama-turbo-wasm.c (simplified inference engine)
```
Web Worker keeps the UI thread responsive. WASM provides near-native speed.
## Target Models
| Model | Size (quantized) | Vocab | Dim | Layers | Status |
|-------|------------------|-------|-----|--------|--------|
| Falcon-H1-Tiny-90M | ~45MB (Q2_K) | 32000 | 256 | 22 | Target |
| Bonsai-1.7B | ~400MB (Q4_0) | 32000 | 2048 | 24 | Stretch |
## Build
```bash
source /path/to/emsdk/emsdk_env.sh
bash wasm/build.sh
```
## Run
Serve the `wasm/` directory:
```bash
cd wasm && python3 -m http.server 8080
# Open http://localhost:8080
```
## API (from JavaScript)
```js
const worker = new Worker('inference-worker.js');
worker.onmessage = (e) => console.log(e.data);
// Init
worker.postMessage({cmd: 'init'});
// Load model (ArrayBuffer)
const resp = await fetch('model.bin');
const buf = await resp.arrayBuffer();
worker.postMessage({cmd: 'load', data: buf}, [buf]);
// Generate
worker.postMessage({cmd: 'generate', prompt: 'Hello', maxTokens: 64, temperature: 0.7});
// Benchmark
worker.postMessage({cmd: 'benchmark', runs: 100});
```
## Browser Memory Limits
| Browser | WASM Memory | 90M OK? | 1.7B OK? |
|---------|------------|---------|----------|
| Chrome | 4GB | Yes | Yes |
| Firefox | 2GB | Yes | Yes |
| Safari | 1GB | Yes | Borderline |
## Viability Assessment
See benchmark results in the demo page after loading a model.
Closes #104

48
wasm/build.sh Normal file
View File

@@ -0,0 +1,48 @@
#!/bin/bash
# build.sh — Build TurboQuant WASM module
# Requires: Emscripten (emcc)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
OUT_DIR="$SCRIPT_DIR"
SRC="$SCRIPT_DIR/llama-turbo-wasm.c"

if ! command -v emcc &>/dev/null; then
    echo "ERROR: emcc not found. Install Emscripten:"
    echo "  git clone https://github.com/emscripten-core/emsdk.git"
    echo "  cd emsdk && ./emsdk install latest && ./emsdk activate latest"
    echo "  source ./emsdk_env.sh"
    exit 1
fi

echo "[BUILD] Compiling $SRC to WASM..."
# Fixes vs. original:
#  - line continuations were doubled ("\\"), which escapes the backslash
#    itself and splits the emcc invocation into broken commands;
#  - the first invocation passed --embed-file "" (an empty path argument)
#    and hid all compiler output with 2>/dev/null before retrying the same
#    flags — a single clean invocation is equivalent and surfaces errors.
emcc "$SRC" -O3 \
    -s WASM=1 \
    -s EXPORTED_FUNCTIONS='["_model_load","_model_free","_generate","_benchmark","_get_vocab_size","_get_context_size","_malloc","_free"]' \
    -s EXPORTED_RUNTIME_METHODS='["ccall","cwrap","UTF8ToString"]' \
    -s INITIAL_MEMORY=67108864 \
    -s ALLOW_MEMORY_GROWTH=1 \
    -s MODULARIZE=1 \
    -s EXPORT_NAME='createModule' \
    -o "$OUT_DIR/llama-turbo-wasm.js"

if [ -f "$OUT_DIR/llama-turbo-wasm.wasm" ]; then
    WASM_SIZE=$(wc -c < "$OUT_DIR/llama-turbo-wasm.wasm")
    echo "[BUILD] OK — llama-turbo-wasm.wasm ($WASM_SIZE bytes)"
else
    echo "[BUILD] WARN — .wasm file not found, may be embedded in .js"
fi
ls -la "$OUT_DIR"/llama-turbo-wasm.* 2>/dev/null || true
echo "[BUILD] Done."

144
wasm/index.html Normal file
View File

@@ -0,0 +1,144 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>TurboQuant WASM Inference</title>
<style>
*{box-sizing:border-box}body{font-family:monospace;max-width:800px;margin:0 auto;padding:20px;background:#0d1117;color:#c9d1d9}
h1{color:#58a6ff}h2{color:#8b949e;border-bottom:1px solid #21262d;padding-bottom:8px}
.card{background:#161b22;border:1px solid #30363d;border-radius:6px;padding:16px;margin:12px 0}
button{background:#238636;color:#fff;border:none;padding:8px 16px;border-radius:6px;cursor:pointer;font-family:monospace;margin:4px}
button:hover{background:#2ea043}button:disabled{background:#21262d;color:#484f58;cursor:not-allowed}
input,textarea,select{background:#0d1117;color:#c9d1d9;border:1px solid #30363d;border-radius:6px;padding:8px;font-family:monospace;width:100%}
#output{min-height:100px;white-space:pre-wrap}#metrics{display:grid;grid-template-columns:1fr 1fr 1fr;gap:8px}
.metric{text-align:center}.metric .val{font-size:24px;color:#58a6ff}.metric .label{font-size:12px;color:#8b949e}
.status{padding:4px 8px;border-radius:4px;font-size:12px}.ok{background:#238636}.err{background:#da3633}.pending{background:#d29922}
</style>
</head>
<body>
<h1>TurboQuant WASM Inference</h1>
<p>Run quantized models directly in the browser via WebAssembly. No server. No data leaves your machine.</p>
<div class="card">
<h2>1. Initialize</h2>
<button id="btnInit" onclick="initModule()">Initialize WASM Module</button>
<span id="initStatus" class="status">not started</span>
</div>
<div class="card">
<h2>2. Load Model</h2>
<input type="file" id="modelFile" accept=".bin,.gguf,.tq" disabled>
<button id="btnLoad" onclick="loadModel()" disabled>Load Model</button>
<span id="loadStatus" class="status">waiting</span>
</div>
<div class="card">
<h2>3. Inference</h2>
<textarea id="prompt" rows="3" placeholder="Enter prompt..." disabled>Hello, I am</textarea>
<div style="display:flex;gap:8px;margin:8px 0">
<select id="temp" disabled><option value="0">Greedy (temp=0)</option><option value="0.2">Focused (0.2)</option><option value="0.7" selected>Balanced (0.7)</option><option value="1.0">Creative (1.0)</option></select>
<input type="number" id="maxTok" value="64" min="1" max="512" style="width:80px" disabled>
<button id="btnGen" onclick="doGenerate()" disabled>Generate</button>
<button id="btnBench" onclick="doBenchmark()" disabled>Benchmark</button>
</div>
<div id="output" class="card"></div>
</div>
<div class="card">
<h2>Metrics</h2>
<div id="metrics"><div class="metric"><div class="val" id="mLoad">--</div><div class="label">Load (ms)</div></div><div class="metric"><div class="val" id="mLat">--</div><div class="label">Latency (tok/s)</div></div><div class="metric"><div class="val" id="mMem">--</div><div class="label">Memory (MB)</div></div></div>
</div>
<div class="card">
<h2>Viability Assessment</h2>
<pre id="assessment">
Waiting for benchmark results...
Target models:
Falcon-H1-Tiny-90M: ~45MB quantized (Q2_K)
Bonsai-1.7B: ~400MB quantized (Q4_0)
Browser limits:
Chrome: 4GB WASM memory
Firefox: 2GB WASM memory
Safari: 1GB WASM memory (may fail on 1.7B)
</pre>
</div>
<script>
let worker = null;
let loadStart = 0;
// Spin up the inference Web Worker and ask it to initialize the WASM module.
function initModule() {
  const status = document.getElementById('initStatus');
  status.className = 'status pending';
  status.textContent = 'loading...';
  worker = new Worker('inference-worker.js');
  worker.onmessage = handleMsg;
  worker.postMessage({cmd: 'init'});
}
// Dispatch messages from the inference worker to the UI.
// Events handled: 'ready' (WASM initialized), 'loaded' (model load result),
// 'generated' (streamed text + throughput), 'benchmark' (latency summary),
// 'error' (any worker-side failure).
function handleMsg(e) {
const m = e.data;
if (m.event === 'ready') {
// WASM module initialized — unlock the model-loading controls.
document.getElementById('initStatus').className = 'status ok';
document.getElementById('initStatus').textContent = 'ready';
document.getElementById('modelFile').disabled = false;
document.getElementById('btnLoad').disabled = false;
}
else if (m.event === 'loaded') {
// Model load finished (m.ok) or failed; record load time in the metrics grid.
const ms = m.ms.toFixed(0);
document.getElementById('loadStatus').className = m.ok ? 'status ok' : 'status err';
document.getElementById('loadStatus').textContent = m.ok ? 'loaded (' + ms + 'ms)' : 'FAILED';
document.getElementById('mLoad').textContent = ms;
if (m.ok) enableInference();
}
else if (m.event === 'generated') {
// Append generated text and update throughput / memory metrics.
document.getElementById('output').textContent += m.text;
document.getElementById('mLat').textContent = m.tokensPerSec;
// performance.memory is non-standard (Chrome only), hence the guard.
if (performance.memory) {
document.getElementById('mMem').textContent = (performance.memory.usedJSHeapSize / 1e6).toFixed(0);
}
}
else if (m.event === 'benchmark') {
// Render the viability verdict; <100 ms/token is the "viable" cutoff.
const ms = m.msPerToken.toFixed(1);
const tps = (1000 / m.msPerToken).toFixed(1);
document.getElementById('assessment').textContent =
'BENCHMARK RESULTS (' + m.runs + ' runs)\n' +
' Latency: ' + ms + ' ms/token (' + tps + ' tok/s)\n' +
' Load time: ' + document.getElementById('mLoad').textContent + ' ms\n' +
' Memory: ' + document.getElementById('mMem').textContent + ' MB\n\n' +
'VIABILITY: ' + (m.msPerToken < 100 ? 'VIABLE' : 'BORDERLINE — consider smaller model') + '\n' +
(m.msPerToken < 100 ? 'Ready for the-door service worker integration.' : 'Try Falcon-H1-Tiny-90M for faster inference.');
}
else if (m.event === 'error') {
document.getElementById('output').textContent = 'ERROR: ' + m.msg;
}
}
// Read the selected model file into an ArrayBuffer and transfer it to the
// worker (transfer list avoids copying the buffer).
function loadModel() {
  const file = document.getElementById('modelFile').files[0];
  if (!file) return;
  const status = document.getElementById('loadStatus');
  status.className = 'status pending';
  status.textContent = 'reading ' + (file.size/1e6).toFixed(1) + 'MB...';
  const reader = new FileReader();
  reader.onload = () => worker.postMessage({cmd: 'load', data: reader.result}, [reader.result]);
  reader.readAsArrayBuffer(file);
}
// Unlock the prompt/inference controls once a model has loaded successfully.
function enableInference() {
  const controls = ['prompt', 'temp', 'maxTok', 'btnGen', 'btnBench'];
  for (const id of controls) {
    document.getElementById(id).disabled = false;
  }
}
// Clear the output pane and request a generation run from the worker using
// the current prompt, token limit, and temperature selections.
function doGenerate() {
  const prompt = document.getElementById('prompt').value;
  const maxTokens = parseInt(document.getElementById('maxTok').value);
  const temperature = parseFloat(document.getElementById('temp').value);
  document.getElementById('output').textContent = '';
  worker.postMessage({cmd: 'generate', prompt: prompt, maxTokens: maxTokens, temperature: temperature});
}
// Ask the worker to run a fixed 100-iteration latency benchmark.
function doBenchmark() {
  const request = {cmd: 'benchmark', runs: 100};
  worker.postMessage(request);
}
</script>
</body>
</html>

52
wasm/inference-worker.js Normal file
View File

@@ -0,0 +1,52 @@
/* inference-worker.js — Web Worker for non-blocking WASM inference.
 *
 * Messages IN: {cmd: 'load', data: ArrayBuffer} | {cmd: 'generate', prompt, maxTokens, temperature} | {cmd: 'benchmark', runs}
 * Messages OUT: {event: 'loaded', ok, ms} | {event: 'generated', text, tokensPerSec} | {event: 'benchmark', msPerToken} | {event: 'error', msg}
 */
let Module = null;
let modelLoaded = false;
self.onmessage = async function(e) {
  const msg = e.data;
  try {
    if (msg.cmd === 'init') {
      // Pull in the Emscripten glue and instantiate the WASM module.
      importScripts('llama-turbo-wasm.js');
      Module = await createModule();
      self.postMessage({event: 'ready'});
    }
    else if (msg.cmd === 'load') {
      if (!Module) throw new Error('Module not initialized');
      // Copy the model bytes into WASM heap memory, hand them to C, free after.
      const buf = new Uint8Array(msg.data);
      const ptr = Module._malloc(buf.length);
      Module.HEAPU8.set(buf, ptr);
      const t0 = performance.now();
      const rc = Module.ccall('model_load', 'number', ['number','number'], [ptr, buf.length]);
      const ms = performance.now() - t0;
      Module._free(ptr);
      modelLoaded = rc === 0;
      self.postMessage({event: 'loaded', ok: rc === 0, ms});
    }
    else if (msg.cmd === 'generate') {
      if (!modelLoaded) throw new Error('Model not loaded');
      const maxTok = msg.maxTokens || 64;
      // Fix: `msg.temperature || 0.7` treated a legitimate temperature of 0
      // (the demo page's "Greedy (temp=0)" option) as unset and silently
      // substituted 0.7. Only fall back when the field is actually absent.
      const temp = (msg.temperature === undefined || msg.temperature === null) ? 0.7 : msg.temperature;
      const outPtr = Module._malloc(maxTok * 4);
      const t0 = performance.now();
      const n = Module.ccall('generate', 'number', ['string','number','number','number'], [msg.prompt, outPtr, maxTok, temp]);
      const ms = performance.now() - t0;
      const text = n > 0 ? Module.UTF8ToString(outPtr, n) : '';
      Module._free(outPtr);
      const tps = n > 0 ? (n / (ms / 1000)).toFixed(1) : 0;
      self.postMessage({event: 'generated', text, tokensPerSec: parseFloat(tps), tokens: n, ms});
    }
    else if (msg.cmd === 'benchmark') {
      if (!modelLoaded) throw new Error('Model not loaded');
      const runs = msg.runs || 100;
      const msPerToken = Module.ccall('benchmark', 'number', ['number'], [runs]);
      self.postMessage({event: 'benchmark', msPerToken, runs});
    }
  } catch(err) {
    self.postMessage({event: 'error', msg: err.message || String(err)});
  }
};

140
wasm/llama-turbo-wasm.c Normal file
View File

@@ -0,0 +1,140 @@
/*
* llama-turbo-wasm.c - Minimal llama-turbo inference for WebAssembly.
*
* Simplified version for browser use. Q2_K/Q4_0 targets only.
* Designed for Falcon-H1-Tiny-90M and Bonsai-1.7B.
*
* Build:
* emcc wasm/llama-turbo-wasm.c -O3 -s WASM=1 \\
* -s EXPORTED_RUNTIME_METHODS='["ccall","cwrap","UTF8ToString"]' \\
* -s INITIAL_MEMORY=67108864 -s ALLOW_MEMORY_GROWTH=1 \\
* -o wasm/llama-turbo-wasm.js
*/
#include <emscripten.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MAX_VOCAB 32000
#define MAX_CTX 2048
#define MAX_LAYERS 32
#define MAX_DIM 768
typedef struct {
int n_vocab, n_embd, n_layer, n_head, n_ctx;
} hparams_t;
typedef struct {
hparams_t hp;
float* embed;
float* output;
float* ln_att[MAX_LAYERS];
float* ln_ffn[MAX_LAYERS];
float* k_cache;
float* v_cache;
int loaded;
} model_t;
static model_t M;
/* Allocate n floats with 32-byte alignment; the byte count is rounded up to
 * a multiple of 32 because aligned_alloc requires size % alignment == 0.
 * NOTE(review): callers never check for NULL — confirm allocation failure
 * is acceptable to ignore in the WASM build (ALLOW_MEMORY_GROWTH). */
static float* alloc(size_t n) {
return (float*)aligned_alloc(32, (n * sizeof(float) + 31) & ~31);
}
/* Run the simplified forward pass over the n prompt tokens and return the
 * next token id, or -1 if no model is loaded / n is invalid.
 * Note: per layer only an RMS-style normalization scaled by ln_att is
 * applied — there is no attention or FFN computation here; ln_ffn and the
 * KV caches are loaded but unused by this function. */
static int gen_token(const int* toks, int n, float temp) {
if (!M.loaded || n <= 0) return -1;
int d = M.hp.n_embd;
/* Gather token embeddings into h: n rows of d floats. */
float* h = alloc(n * d);
for (int t = 0; t < n; t++)
memcpy(&h[t * d], &M.embed[toks[t] * d], d * sizeof(float));
float* buf = alloc(n * d);
for (int l = 0; l < M.hp.n_layer; l++) {
for (int t = 0; t < n; t++) {
/* Normalize each position by sqrt(mean(x^2) + eps), scale by ln_att. */
float ss = 0;
for (int i = 0; i < d; i++) ss += h[t*d+i] * h[t*d+i];
float r = sqrtf(ss/d + 1e-6f);
for (int i = 0; i < d; i++) buf[t*d+i] = h[t*d+i]/r * M.ln_att[l][i];
}
memcpy(h, buf, n * d * sizeof(float));
}
/* Project the (normalized) last position onto the output matrix → logits. */
float* logits = alloc(M.hp.n_vocab);
float ss = 0;
for (int i = 0; i < d; i++) ss += h[(n-1)*d+i] * h[(n-1)*d+i];
float r = sqrtf(ss/d + 1e-6f);
for (int v = 0; v < M.hp.n_vocab; v++) {
float dot = 0;
for (int i = 0; i < d; i++) dot += h[(n-1)*d+i]/r * M.output[v*d+i];
logits[v] = dot;
}
int best = 0;
if (temp <= 0) {
/* Greedy decoding: argmax over logits. */
for (int v = 1; v < M.hp.n_vocab; v++) if (logits[v] > logits[best]) best = v;
} else {
/* Temperature sampling: max-subtracted softmax for numeric stability,
 * then draw from the cumulative distribution using rand() (unseeded,
 * so the sequence is deterministic across runs). */
float mx = logits[0];
for (int v = 1; v < M.hp.n_vocab; v++) if (logits[v] > mx) mx = logits[v];
float s = 0;
for (int v = 0; v < M.hp.n_vocab; v++) { logits[v] = expf((logits[v]-mx)/temp); s += logits[v]; }
float r2 = ((float)rand()/RAND_MAX)*s, cs = 0;
for (int v = 0; v < M.hp.n_vocab; v++) { cs += logits[v]; if (cs >= r2) { best = v; break; } }
}
free(h); free(buf); free(logits);
return best;
}
/* Forward declaration: model_load calls model_free, which is defined below
 * (the original relied on an implicit declaration — an error in C99+). */
void model_free(void);

/*
 * Parse a model blob laid out as:
 *   header: 5 ints — n_vocab, n_embd, n_layer, n_head, n_ctx
 *   embed:  n_vocab * n_embd floats
 *   output: n_vocab * n_embd floats
 *   per layer: ln_att (n_embd floats), then ln_ffn (n_embd floats)
 * Returns 0 on success, a negative code for a bad header or truncated blob.
 * Fix vs. original: a load that failed part-way (-4..-6) leaked every
 * buffer allocated so far; failures now release them. Non-positive header
 * dims are also rejected (-2) — they made the size math meaningless.
 */
EMSCRIPTEN_KEEPALIVE int model_load(uint8_t* data, int len) {
if (M.loaded) model_free();
memset(&M, 0, sizeof(M));
if (len < (int)(5*sizeof(int))) return -1;
int* hdr = (int*)data;
M.hp.n_vocab=hdr[0]; M.hp.n_embd=hdr[1]; M.hp.n_layer=hdr[2];
M.hp.n_head=hdr[3]; M.hp.n_ctx=hdr[4];
if (M.hp.n_vocab>MAX_VOCAB||M.hp.n_ctx>MAX_CTX||M.hp.n_layer>MAX_LAYERS||M.hp.n_embd>MAX_DIM) return -2;
if (M.hp.n_vocab<=0||M.hp.n_ctx<=0||M.hp.n_layer<=0||M.hp.n_embd<=0) return -2;
const int d = M.hp.n_embd, nv = M.hp.n_vocab;
size_t off = 5*sizeof(int);
const size_t eb = (size_t)nv*d*sizeof(float);
int rc;
if (off+eb > (size_t)len) { rc = -3; goto fail; }
M.embed = alloc((size_t)nv*d); memcpy(M.embed, data+off, eb); off += eb;
if (off+eb > (size_t)len) { rc = -4; goto fail; }
M.output = alloc((size_t)nv*d); memcpy(M.output, data+off, eb); off += eb;
for (int l=0;l<M.hp.n_layer;l++) {
const size_t nb = d*sizeof(float);
if (off+nb > (size_t)len) { rc = -5; goto fail; }
M.ln_att[l] = alloc(d); memcpy(M.ln_att[l], data+off, nb); off += nb;
if (off+nb > (size_t)len) { rc = -6; goto fail; }
M.ln_ffn[l] = alloc(d); memcpy(M.ln_ffn[l], data+off, nb); off += nb;
}
M.k_cache = alloc((size_t)M.hp.n_ctx*d);
M.v_cache = alloc((size_t)M.hp.n_ctx*d);
M.loaded = 1; return 0;
fail:
/* M was zeroed above, so free(NULL) is a safe no-op for untouched slots. */
free(M.embed); free(M.output);
for (int l=0;l<MAX_LAYERS;l++) { free(M.ln_att[l]); free(M.ln_ffn[l]); }
memset(&M, 0, sizeof(M));
return rc;
}
/* Release all model buffers and mark the model unloaded. A no-op when
 * nothing is loaded. Only the first n_layer entries of ln_att/ln_ffn are
 * freed — slots beyond that were never allocated. */
EMSCRIPTEN_KEEPALIVE void model_free(void) {
if (!M.loaded) return;
free(M.embed); free(M.output);
for (int l=0;l<M.hp.n_layer;l++) { free(M.ln_att[l]); free(M.ln_ffn[l]); }
free(M.k_cache); free(M.v_cache);
M.loaded=0;
}
/*
 * Generate up to max_tok tokens from `prompt` into `out` (NUL-terminated).
 * Returns the number of bytes written, or -1 if no model is loaded.
 * "Tokenization" is one token per prompt byte; only generated ids < 256 are
 * emitted as output bytes, and id 2 stops generation (presumably an EOS
 * convention — confirm against the target models' vocab).
 * NOTE(review): `out` must hold at least max_tok bytes plus the NUL; the
 * worker allocates max_tok * 4, which satisfies this.
 */
EMSCRIPTEN_KEEPALIVE int generate(const char* prompt, char* out, int max_tok, float temp) {
if (!M.loaded) return -1;
int toks[MAX_CTX];
/* Byte-level prompt encoding, leaving headroom for max_tok generated ids. */
int n=0; const char* p=prompt;
while(*p&&n<MAX_CTX-max_tok) { toks[n++]=(unsigned char)*p; p++; }
int ol=0;
for (int i=0;i<max_tok&&n<MAX_CTX;i++) {
int nx=gen_token(toks,n,temp);
if (nx<0||nx==2) break;
if (nx<256) out[ol++]=(char)nx;
toks[n++]=nx;
}
out[ol]=0; return ol;
}
/* Accessors for the loaded model's header fields; both return 0 before a
 * successful load because M lives in zero-initialized static storage. */
EMSCRIPTEN_KEEPALIVE int get_vocab_size(void) { return M.hp.n_vocab; }
EMSCRIPTEN_KEEPALIVE int get_context_size(void) { return M.hp.n_ctx; }
/*
 * Time `runs` greedy single-token generations over a fixed 4-token prompt
 * and return the mean milliseconds per token (CPU time via clock()).
 * Fix: the original divided by `runs` unchecked, so runs == 0 produced
 * inf/NaN; non-positive runs now return -1, matching the not-loaded case.
 */
EMSCRIPTEN_KEEPALIVE float benchmark(int runs) {
if (!M.loaded || runs <= 0) return -1;
int t[4]={72,101,108,111};  /* byte "tokens" for "Helo" */
clock_t s=clock();
for (int i=0;i<runs;i++) gen_token(t,4,0.0f);
return (float)(clock()-s)/CLOCKS_PER_SEC/runs*1000.0f;
}