Implements M3 Sovereignty Layer features from issue #15: - edge-intelligence.js: Local-first LLM inference pipeline - Tries WebLLM (SmolLM2-360M via WebGPU) first for near-zero latency - Falls back to Transformers.js (LaMini-Flan-T5-77M, CPU/WASM) - Falls back to Ollama backend; never blocks on missing services - Lazy activation via HUD button so models only load on user demand - nostr-identity.js: Silent Nostr signing without extension popup - Generates a keypair on first visit, persists to localStorage - Signs NIP-01 events locally (no window.nostr / extension needed) - Supports importKey() for existing identities and rotateKey() - Optional delegation to NIP-07 extension via useExtension(true) - app.js: Integrates both modules - Chat pipeline: edge model → Ollama → local fallback responses - Animated "thinking" indicator while inference runs - Nostr npub displayed in HUD on init - index.html + style.css: Edge AI status badge + Nostr identity badge in the HUD, with loading/ready/fallback states Fixes #15 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
146 lines
4.9 KiB
JavaScript
// ═══════════════════════════════════════════
// EDGE INTELLIGENCE — Browser-side LLM
// ═══════════════════════════════════════════
// Inference priority:
// 1. WebLLM (WebGPU) — SmolLM2-360M, near-zero latency once loaded
// 2. Transformers.js (WASM/CPU) — LaMini-Flan-T5-77M, broader device support
// 3. Signals caller to use Ollama backend
// All modes are local-first; no server round-trip for simple queries.

// MLC prebuilt model id used by WebLLM (q0f16 quantization).
const WEBLLM_MODEL = 'SmolLM2-360M-Instruct-q0f16-MLC';
// Hugging Face hub id for the Transformers.js CPU/WASM fallback model.
const TRANSFORMERS_MODEL = 'Xenova/LaMini-Flan-T5-77M';

// System prompt shared by every local completion.
const SYSTEM_PROMPT =
  'You are Timmy, a sovereign lizard-wizard AI in The Nexus. ' +
  'Be concise, witty, and helpful. Keep replies under 2 sentences.';

// ─── State ─────────────────────────────────
let _engine = null;   // WebLLM engine object, or Transformers.js pipeline function
let _mode = 'uninitialized'; // 'webllm' | 'transformers' | 'ollama' | 'uninitialized'
let _statusCb = null; // HUD status callback registered by init(); null before init
let _isLoading = false; // true while a model load is in flight
let _isReady = false;   // true once a webllm/transformers model is usable
|
// ─── Public API ────────────────────────────

/**
 * Begin loading the best available edge model.
 * No-op when a load is already in flight or a model is ready.
 * Never throws: if every local backend fails, signals the Ollama fallback
 * through the status callback instead.
 * @param {(state: string, text: string) => void} statusCallback
 */
async function init(statusCallback) {
  if (_isLoading || _isReady) return;
  _isLoading = true;
  _statusCb = statusCallback ?? (() => {});

  _setStatus('loading', 'Probing edge capabilities…');

  // Probe in priority order; short-circuits as soon as one backend loads.
  const loaded = (await _tryWebLLM()) || (await _tryTransformers());
  _isLoading = false;

  if (loaded) {
    _isReady = true;
    return;
  }

  // Both failed — signal Ollama fallback
  _mode = 'ollama';
  _setStatus('fallback', 'Edge AI: Ollama backend');
}
|
/**
 * Run inference on a user message.
 * Returns a string if handled locally, null if caller should use Ollama.
 * Inference errors are logged and reported as null (Ollama fallback),
 * never thrown to the caller.
 * @param {string} userMessage
 * @returns {Promise<string|null>}
 */
async function query(userMessage) {
  if (!_isReady) return null;

  try {
    switch (_mode) {
      case 'webllm': {
        if (!_engine) break;
        const completion = await _engine.chat.completions.create({
          messages: [
            { role: 'system', content: SYSTEM_PROMPT },
            { role: 'user', content: userMessage },
          ],
          max_tokens: 120,
          temperature: 0.7,
          stream: false,
        });
        return completion.choices[0].message.content.trim();
      }
      case 'transformers': {
        if (!_engine) break;
        const outputs = await _engine(userMessage, {
          max_new_tokens: 80,
          do_sample: true,
          temperature: 0.7,
        });
        const raw = outputs[0]?.generated_text ?? '';
        // Strip the input echo that some seq2seq models return
        const stripped = raw.replace(userMessage, '').trim();
        return stripped || raw.trim();
      }
    }
  } catch (err) {
    console.warn('[EdgeAI] Inference error:', err);
  }

  return null; // caller should fall back to Ollama
}
|
/** True once a model is loaded and ready. */
const isReady = () => _isReady;

/** Current inference mode string. */
const getMode = () => _mode;
|
// ─── Private helpers ───────────────────────

/**
 * Attempt to boot WebLLM over WebGPU.
 * Bails out immediately when navigator.gpu is absent; any load failure is
 * logged and reported as false so the caller can try the next backend.
 * @returns {Promise<boolean>} true when the engine is ready
 */
async function _tryWebLLM() {
  if (!navigator.gpu) return false;
  try {
    _setStatus('loading', 'Initializing WebLLM (WebGPU)…');
    const webllm = await import('https://esm.run/@mlc-ai/web-llm');
    const onProgress = (p) => {
      const pct = Math.round((p.progress ?? 0) * 100);
      _setStatus('loading', `Loading SmolLM2: ${pct}%`);
    };
    _engine = await webllm.CreateMLCEngine(WEBLLM_MODEL, {
      initProgressCallback: onProgress,
    });
    _mode = 'webllm';
    _setStatus('ready', 'Edge AI: WebGPU ⚡');
    return true;
  } catch (err) {
    console.warn('[EdgeAI] WebLLM unavailable:', err.message ?? err);
    return false;
  }
}
|
/**
 * Attempt to boot Transformers.js (WASM/CPU).
 * Pulls the library from the jsDelivr CDN and the model from the
 * Hugging Face hub on first use. Any failure is logged and reported as
 * false so the caller can fall back to Ollama.
 * @returns {Promise<boolean>} true when the pipeline is ready
 */
async function _tryTransformers() {
  try {
    _setStatus('loading', 'Initializing edge model (CPU)…');
    const { pipeline, env } = await import(
      'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2'
    );
    // Use local cache; allow remote model hub
    env.allowLocalModels = false;
    env.allowRemoteModels = true;

    _engine = await pipeline('text2text-generation', TRANSFORMERS_MODEL, {
      progress_callback: (info) => {
        // BUGFIX: transformers.js emits per-file download progress with
        // status 'progress' (carrying a 0-100 `progress` field), not
        // 'downloading' — the old check never matched, so the HUD never
        // showed a download percentage.
        if (info.status === 'progress') {
          const pct = Math.round(info.progress ?? 0);
          _setStatus('loading', `Downloading model: ${pct}%`);
        }
      },
    });
    _mode = 'transformers';
    _setStatus('ready', 'Edge AI: CPU ◈');
    return true;
  } catch (err) {
    console.warn('[EdgeAI] Transformers.js unavailable:', err.message ?? err);
    return false;
  }
}
|
/**
 * Forward a status update to the HUD callback registered by init().
 * BUGFIX: _statusCb starts as null, so any status emitted before init()
 * runs would throw a TypeError; optional-call makes pre-init emissions
 * a safe no-op.
 * @param {string} state - e.g. 'loading' | 'ready' | 'fallback'
 * @param {string} text - human-readable status line
 */
function _setStatus(state, text) {
  _statusCb?.(state, text);
}
|
// Public module surface: lazy model bootstrap, local inference, and
// readiness/mode accessors for the HUD.
export const EdgeIntelligence = { init, query, isReady, getMode };