// Timmy-time-dashboard/static/local_llm.js

/**
 * local_llm.js — In-browser LLM inference via WebLLM.
 *
 * Loads a small language model directly into the browser using WebGPU
 * (or WASM fallback) so Timmy can run on an iPhone with zero server
 * dependency.  Falls back to server-side Ollama when the local model
 * is unavailable.
 *
 * Usage:
 *   const llm = new LocalLLM({ modelId, onProgress, onReady, onError });
 *   await llm.init();
 *   const reply = await llm.chat("Hello Timmy");
 */

/* global webllm */

// ── Model catalogue ────────────────────────────────────────────────────────
// Models tested on iPhone 15 Pro / Safari 26+.  Sorted smallest → largest
// by approximate download size (`sizeHint`); keep new entries in that order
// so UI lists built from this array stay size-ordered.
//
// Each entry:
//   id          — WebLLM model ID, passed to LocalLLM as `modelId`
//   label       — short display name
//   sizeHint    — approximate download size, display only
//   description — one-line blurb for the model picker
const MODEL_CATALOGUE = [
  {
    id: "SmolLM2-360M-Instruct-q4f16_1-MLC",
    label: "SmolLM2 360M (fast)",
    sizeHint: "~200 MB",
    description: "Fastest option. Good for simple Q&A.",
  },
  {
    id: "Qwen2.5-0.5B-Instruct-q4f16_1-MLC",
    label: "Qwen 2.5 0.5B (balanced)",
    sizeHint: "~350 MB",
    description: "Best quality under 500 MB.",
  },
  {
    id: "Llama-3.2-1B-Instruct-q4f16_1-MLC",
    label: "Llama 3.2 1B",
    sizeHint: "~700 MB",
    description: "Meta's compact model. Good all-rounder.",
  },
  {
    id: "SmolLM2-1.7B-Instruct-q4f16_1-MLC",
    label: "SmolLM2 1.7B (smart)",
    sizeHint: "~1 GB",
    description: "Highest quality. Needs more memory.",
  },
];

// ── Capability detection ──────────────────────────────────────────────────
/**
 * Report whether the current environment exposes a `navigator.gpu` property.
 * Only the presence of the property is checked; no adapter is requested.
 *
 * @returns {boolean} true when `navigator` exists and has a `gpu` member
 */
function detectWebGPU() {
  // Non-browser environments may have no `navigator` global at all.
  if (typeof navigator === "undefined") {
    return false;
  }
  return "gpu" in navigator;
}

/**
 * Report whether a usable `WebAssembly` global is present.
 *
 * @returns {boolean} true when `WebAssembly` is an object exposing an
 *   `instantiate` function; false otherwise, including when probing throws
 */
function detectWASM() {
  try {
    if (typeof WebAssembly !== "object") {
      return false;
    }
    return typeof WebAssembly.instantiate === "function";
  } catch {
    // Any failure while probing is treated as "not available".
    return false;
  }
}

// ── LocalLLM class ────────────────────────────────────────────────────────
class LocalLLM {
  /**
   * Create a wrapper around a WebLLM engine. Nothing is downloaded or
   * loaded until init() is called.
   *
   * @param {object}   [opts]
   * @param {string}   [opts.modelId]      — WebLLM model ID (defaults to the SmolLM2 360M build)
   * @param {function} [opts.onProgress]   — (report) called with each init progress report
   * @param {function} [opts.onReady]      — () called when the model is loaded
   * @param {function} [opts.onError]      — (error) called when init() fails
   * @param {string}   [opts.systemPrompt] — system message sent with every chat() call
   */
  constructor(opts = {}) {
    this.modelId = opts.modelId || "SmolLM2-360M-Instruct-q4f16_1-MLC";
    this.onProgress = opts.onProgress || (() => {});
    this.onReady = opts.onReady || (() => {});
    this.onError = opts.onError || (() => {});
    this.systemPrompt =
      opts.systemPrompt ||
      "You are a local AI assistant running in the browser. You are helpful and concise. " +
      "Keep responses brief on mobile.";

    this.engine = null;
    this.ready = false;
    this.loading = false;
    // Promise of the init() currently in flight (null when idle), so that
    // concurrent init() callers can all wait on the same load.
    this._initPromise = null;
    this._hasWebGPU = detectWebGPU();
    this._hasWASM = detectWASM();
  }

  /**
   * Check if local inference is possible on this device.
   * @returns {boolean} true when either WebGPU or WebAssembly is detected
   */
  static isSupported() {
    return detectWebGPU() || detectWASM();
  }

  /**
   * Return the model catalogue for UI rendering.
   * @returns {Array<object>} the shared catalogue array (not a copy)
   */
  static getCatalogue() {
    return MODEL_CATALOGUE;
  }

  /**
   * Return runtime capability info captured when this instance was built.
   * @returns {{webgpu: boolean, wasm: boolean, supported: boolean, backend: string}}
   *   `backend` is "WebGPU", "WASM" or "none", preferring WebGPU.
   */
  getCapabilities() {
    return {
      webgpu: this._hasWebGPU,
      wasm: this._hasWASM,
      supported: this._hasWebGPU || this._hasWASM,
      backend: this._hasWebGPU ? "WebGPU" : this._hasWASM ? "WASM" : "none",
    };
  }

  /**
   * Initialize the engine and load the model.
   *
   * Resolves once the model is ready. Calling init() again while a load is
   * in progress returns the same in-flight promise, so every caller can
   * safely `await llm.init()` before calling chat(). Calling it after a
   * successful load is a no-op. After a failed load it may be called again
   * to retry.
   *
   * NOTE(review): the original comment stated that model weights are cached
   * in the browser's Cache API so later loads are fast; that is WebLLM
   * behaviour and is not controlled by this file — confirm against WebLLM.
   *
   * @returns {Promise<void>}
   * @throws {Error} when neither WebGPU nor WebAssembly is available, or
   *   when loading the library or model fails (onError is called first)
   */
  async init() {
    if (this.ready) return;
    // A load is already running: share it instead of returning early with
    // the model still unloaded.
    if (this._initPromise) return this._initPromise;

    if (!this._hasWebGPU && !this._hasWASM) {
      const err = new Error(
        "Neither WebGPU nor WebAssembly is available. " +
        "Update to iOS 26+ / Safari 26+ for WebGPU support."
      );
      this.onError(err);
      throw err;
    }

    this.loading = true;
    this._initPromise = this._createEngine();
    try {
      await this._initPromise;
    } finally {
      // Clear on success and on failure so a failed load can be retried.
      this._initPromise = null;
    }
  }

  /**
   * Send a chat message and get a response.
   *
   * Each call sends only the system prompt and `userMessage`; this class
   * does not accumulate earlier turns into the request.
   *
   * @param {string} userMessage
   * @param {object} [opts]
   * @param {function} [opts.onToken] — streaming callback (delta, fullTextSoFar);
   *   when provided the request is made in streaming mode
   * @returns {Promise<string>} full response text ("" when the engine
   *   returns no content)
   * @throws {Error} when the model has not been loaded via init()
   */
  async chat(userMessage, opts = {}) {
    if (!this.ready) {
      throw new Error("Model not loaded. Call init() first.");
    }

    const request = {
      messages: [
        { role: "system", content: this.systemPrompt },
        { role: "user", content: userMessage },
      ],
      temperature: 0.7,
      max_tokens: 512,
    };

    if (opts.onToken) {
      // Streaming mode: forward every delta as it arrives.
      let fullText = "";
      const chunks = await this.engine.chat.completions.create({
        ...request,
        stream: true,
      });

      for await (const chunk of chunks) {
        // Chunks without a choice or without content contribute "".
        const delta = chunk.choices[0]?.delta?.content || "";
        fullText += delta;
        opts.onToken(delta, fullText);
      }
      return fullText;
    }

    // Non-streaming mode
    const response = await this.engine.chat.completions.create(request);
    return response.choices[0]?.message?.content || "";
  }

  /**
   * Reset conversation context by delegating to the engine's resetChat().
   * Does nothing when no engine has been created.
   * @returns {Promise<void>}
   */
  async resetChat() {
    if (this.engine) {
      await this.engine.resetChat();
    }
  }

  /**
   * Unload the model via the engine's unload() and mark this instance as
   * not ready. Does nothing when no engine has been created.
   * @returns {Promise<void>}
   */
  async unload() {
    if (this.engine) {
      await this.engine.unload();
      this.engine = null;
      this.ready = false;
    }
  }

  /**
   * Get the engine's runtime stats text.
   * @returns {Promise<string|null>} whatever engine.runtimeStatsText()
   *   yields, or null when there is no engine or the call fails
   */
  async getStats() {
    if (!this.engine) return null;
    try {
      return await this.engine.runtimeStatsText();
    } catch {
      // Stats are best-effort; callers only need to know none are available.
      return null;
    }
  }

  // ── Private ─────────────────────────────────────────────────────────────

  /**
   * Perform one load attempt: ensure the WebLLM library is present, create
   * the engine for `this.modelId`, then update state flags and fire the
   * onReady / onError callbacks.
   * @returns {Promise<void>}
   * @throws {Error} rethrows any failure after calling onError
   */
  async _createEngine() {
    try {
      // Load WebLLM from the CDN on demand (avoids bundling it).
      if (typeof webllm === "undefined") {
        await this._loadWebLLMScript();
      }

      this.engine = await webllm.CreateMLCEngine(this.modelId, {
        initProgressCallback: (report) => {
          this.onProgress(report);
        },
      });

      this.ready = true;
      this.loading = false;
      this.onReady();
    } catch (err) {
      this.loading = false;
      this.ready = false;
      this.onError(err);
      throw err;
    }
  }

  /**
   * Load the WebLLM library from CDN and expose it as `window.webllm`.
   *
   * The ESM bundle is tried first. Only if that import fails is the pinned
   * script-tag bundle tried, and that path rejects when the script loads
   * without defining the `webllm` global (rather than never settling).
   *
   * @returns {Promise<void>}
   * @throws {Error} when neither source yields a `webllm` global; the ESM
   *   import error is attached as `cause`
   */
  async _loadWebLLMScript() {
    // Check if already loaded
    if (typeof webllm !== "undefined") {
      return;
    }

    let esmError;
    try {
      window.webllm = await import("https://esm.run/@mlc-ai/web-llm");
    } catch (err) {
      esmError = err;
    }

    if (esmError === undefined) {
      // Keep announcing the load for any page code listening for it.
      window.dispatchEvent(new Event("webllm-loaded"));
      return;
    }

    // Fallback: the pinned script-tag bundle.
    // NOTE(review): confirm this file actually defines a global `webllm`.
    await new Promise((resolve, reject) => {
      const fail = (message) => {
        const error = new Error(message);
        error.cause = esmError;
        reject(error);
      };
      const fallbackScript = document.createElement("script");
      fallbackScript.src = "https://cdn.jsdelivr.net/npm/@mlc-ai/web-llm@0.2.80/lib/index.min.js";
      fallbackScript.onload = () => {
        if (typeof webllm !== "undefined") {
          resolve();
        } else {
          fail("WebLLM script loaded but did not define the global 'webllm'.");
        }
      };
      fallbackScript.onerror = () => {
        fail("Failed to load WebLLM library from CDN.");
      };
      document.head.appendChild(fallbackScript);
    });
  }
}

// Publish the class and the model catalogue as globals so that page
// templates can reach them without a module system.
Object.assign(window, {
  LocalLLM,
  LOCAL_MODEL_CATALOGUE: MODEL_CATALOGUE,
});